diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 5dbecc79..b1b38f84 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -64,7 +64,7 @@ jobs: - name: Install dependencies run: | python -m pip install --no-cache-dir --upgrade pip - pip install --no-cache-dir pytest numpy scipy py-cpuinfo pytest-repeat + pip install --no-cache-dir py-cpuinfo pytest pytest-repeat numpy scipy tabulate python -c "from cpuinfo import get_cpu_info; print(get_cpu_info())" - name: Build locally on Ubuntu diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e17fafe9..02918b9c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,9 +40,9 @@ cmake --build build_release --config Release Testing: ```sh -pip install -e . # to install the package in editable mode -pip install pytest pytest-repeat # testing dependencies -pytest python/test.py -s -x -Wd # to run tests +pip install -e . # to install the package in editable mode +pip install pytest pytest-repeat tabulate # testing dependencies +pytest python/test.py -s -x -Wd # to run tests # to check supported SIMD instructions: python -c "import simsimd; print(simsimd.get_capabilities())" diff --git a/README.md b/README.md index 0b5fb7b3..f1ffdf22 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,8 @@ SimSIMD provides an alternative. ## Features -__SimSIMD__ provides __over 200 SIMD-optimized kernels__ for various distance and similarity measures, accelerating search in [USearch](https://github.com/unum-cloud/usearch) and several DBMS products. +__SimSIMD__ (Arabic: "سيمسيم دي") is a library of __over 200 SIMD-optimized kernels__ for distance and similarity measures, boosting search performance in [USearch](https://github.com/unum-cloud/usearch) and several database systems. +Named after the iconic ["Open Sesame"](https://en.wikipedia.org/wiki/Open_sesame) command from _Ali Baba and the Forty Thieves_, it opens the door to a modern treasure: squeezing the full potential out of today's hardware. Implemented distance functions include: - Euclidean (L2) and Cosine (Angular) spatial distances for Vector Search. @@ -240,6 +241,18 @@ By default, the output distances will be stored in double-precision `f64` floati That behavior may not be space-efficient, especially if you are computing the hamming distance between short binary vectors, that will generally fit into 8x smaller `u8` or `u16` types. To override this behavior, use the `dtype` argument. +### Helper Functions + +You can turn specific backends on or off depending on the exact environment. +A common case may be avoiding AVX-512 on older AMD CPUs and [Intel Ice Lake](https://travisdowns.github.io/blog/2020/08/19/icl-avx512-freq.html) CPUs to ensure the CPU doesn't change the frequency license and throttle performance. + +```py +$ simsimd.get_capabilities() +> {'serial': True, 'neon': False, 'sve': False, 'neon_f16': False, 'sve_f16': False, 'neon_bf16': False, 'sve_bf16': False, 'neon_i8': False, 'sve_i8': False, 'haswell': True, 'skylake': True, 'ice': True, 'genoa': True, 'sapphire': True} +$ simsimd.disable_capability("sapphire") +$ simsimd.enable_capability("sapphire") +``` + ### Using Python API with USearch Want to use it in Python with [USearch](https://github.com/unum-cloud/usearch)?
diff --git a/include/simsimd/curved.h b/include/simsimd/curved.h index feb49dd3..768021a3 100644 --- a/include/simsimd/curved.h +++ b/include/simsimd/curved.h @@ -25,8 +25,8 @@ #include "types.h" -#include "dot.h" // `simsimd_partial_load_f16x4_neon` and friends -#include "spatial.h" // `simsimd_substract_bf16x32_genoa` +#include "dot.h" // `_simsimd_partial_load_f16x4_neon` and friends +#include "spatial.h" // `_simsimd_substract_bf16x32_genoa` #ifdef __cplusplus extern "C" { @@ -250,9 +250,9 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16_neon(simsimd_f16_t const* a, simsimd_f1 simsimd_size_t tail_start = n - tail_length; if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { - simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a + i, 1))); - float32x4_t b_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b + tail_start, tail_length)); - float32x4_t c_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length)); + simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + i, 1))); + float32x4_t b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + tail_start, tail_length)); + float32x4_t c_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length)); simsimd_f32_t partial_sum = vaddvq_f32(vmulq_f32(b_vec, c_vec)); sum += a_i * partial_sum; } @@ -286,13 +286,13 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_neon(simsimd_f16_t const* a, simsimd simsimd_size_t tail_start = n - tail_length; if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { - simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a + i, 1))); - simsimd_f32_t b_i = vaddvq_f32(vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b + i, 1))); + simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + i, 1))); + simsimd_f32_t b_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + i, 1))); simsimd_f32_t diff_i = a_i - b_i; - float32x4_t a_j_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a + tail_start, tail_length)); - float32x4_t b_j_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b + tail_start, tail_length)); + float32x4_t a_j_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + tail_start, tail_length)); + float32x4_t b_j_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + tail_start, tail_length)); float32x4_t diff_j_vec = vsubq_f32(a_j_vec, b_j_vec); - float32x4_t c_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length)); + float32x4_t c_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length)); simsimd_f32_t partial_sum = vaddvq_f32(vmulq_f32(diff_j_vec, c_vec)); sum += diff_i * partial_sum; } @@ -333,8 +333,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_neon(simsimd_bf16_t const* a, simsimd_ if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { simsimd_f32_t a_i = simsimd_uncompress_bf16(a + i); - bfloat16x8_t b_vec = simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length); - bfloat16x8_t c_vec = simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length); + bfloat16x8_t b_vec = _simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length); + bfloat16x8_t c_vec = _simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length); simsimd_f32_t partial_sum_high = vaddvq_f32(vbfmlaltq_f32(vdupq_n_f32(0), b_vec, c_vec)); simsimd_f32_t partial_sum_low = vaddvq_f32(vbfmlalbq_f32(vdupq_n_f32(0), b_vec, c_vec)); sum += a_i * (partial_sum_high + 
partial_sum_low); @@ -385,8 +385,8 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const* a, simsi simsimd_f32_t a_i = simsimd_uncompress_bf16(a + i); simsimd_f32_t b_i = simsimd_uncompress_bf16(b + i); simsimd_f32_t diff_i = a_i - b_i; - bfloat16x8_t a_j_vec = simsimd_partial_load_bf16x8_neon(a + tail_start, tail_length); - bfloat16x8_t b_j_vec = simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length); + bfloat16x8_t a_j_vec = _simsimd_partial_load_bf16x8_neon(a + tail_start, tail_length); + bfloat16x8_t b_j_vec = _simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length); // Again, upcast for subtraction float32x4_t a_j_vec_high = vcvt_f32_bf16(vget_high_bf16(a_j_vec)); @@ -397,7 +397,7 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const* a, simsi float32x4_t diff_j_vec_low = vsubq_f32(a_j_vec_low, b_j_vec_low); bfloat16x8_t diff_j_vec = vcombine_bf16(vcvt_bf16_f32(diff_j_vec_low), vcvt_bf16_f32(diff_j_vec_high)); - bfloat16x8_t c_vec = simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length); + bfloat16x8_t c_vec = _simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length); simsimd_f32_t partial_sum_high = vaddvq_f32(vbfmlaltq_f32(vdupq_n_f32(0), diff_j_vec, c_vec)); simsimd_f32_t partial_sum_low = vaddvq_f32(vbfmlalbq_f32(vdupq_n_f32(0), diff_j_vec, c_vec)); sum += diff_i * (partial_sum_high + partial_sum_low); @@ -434,15 +434,15 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16_haswell(simsimd_f16_t const* a, simsimd } // Handle the tail of every row - simsimd_f64_t sum = _mm256_reduce_add_ps_dbl(sum_vec); + simsimd_f64_t sum = _simsimd_reduce_f32x8_haswell(sum_vec); simsimd_size_t tail_length = n % 8; simsimd_size_t tail_start = n - tail_length; if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { simsimd_f32_t a_i = _mm256_cvtss_f32(_mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(a + i)))); - __m256 b_vec = simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length); - __m256 c_vec = simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length); - simsimd_f32_t partial_sum = _mm256_reduce_add_ps_dbl(_mm256_mul_ps(b_vec, c_vec)); + __m256 b_vec = _simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length); + __m256 c_vec = _simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length); + simsimd_f32_t partial_sum = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(b_vec, c_vec)); sum += a_i * partial_sum; } } @@ -470,7 +470,7 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_haswell(simsimd_f16_t const* a, sims } // Handle the tail of every row - simsimd_f64_t sum = _mm256_reduce_add_ps_dbl(sum_vec); + simsimd_f64_t sum = _simsimd_reduce_f32x8_haswell(sum_vec); simsimd_size_t tail_length = n % 8; simsimd_size_t tail_start = n - tail_length; if (tail_length) { @@ -479,10 +479,10 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_haswell(simsimd_f16_t const* a, sims _mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(a + i))), // _mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(b + i))))); __m256 diff_j_vec = _mm256_sub_ps( // - simsimd_partial_load_f16x8_haswell(a + tail_start, tail_length), - simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length)); - __m256 c_vec = simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length); - simsimd_f32_t partial_sum = _mm256_reduce_add_ps_dbl(_mm256_mul_ps(diff_j_vec, c_vec)); + _simsimd_partial_load_f16x8_haswell(a + tail_start, tail_length), + _simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length)); + __m256 c_vec = 
_simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length); + simsimd_f32_t partial_sum = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(diff_j_vec, c_vec)); sum += diff_i * partial_sum; } } @@ -495,29 +495,29 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const* a, simsi simsimd_distance_t* result) { __m256 sum_vec = _mm256_setzero_ps(); for (simsimd_size_t i = 0; i != n; ++i) { - // The `simsimd_uncompress_bf16` is cheaper than `simsimd_bf16x8_to_f32x8_haswell` + // The `simsimd_uncompress_bf16` is cheaper than `_simsimd_bf16x8_to_f32x8_haswell` __m256 a_vec = _mm256_set1_ps(simsimd_uncompress_bf16(a + i)); __m256 partial_sum_vec = _mm256_setzero_ps(); for (simsimd_size_t j = 0; j + 8 <= n; j += 8) { - __m256 b_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j))); - __m256 c_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j))); + __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j))); + __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j))); partial_sum_vec = _mm256_fmadd_ps(b_vec, c_vec, partial_sum_vec); } sum_vec = _mm256_fmadd_ps(a_vec, partial_sum_vec, sum_vec); } // Handle the tail of every row - simsimd_f64_t sum = _mm256_reduce_add_ps_dbl(sum_vec); + simsimd_f64_t sum = _simsimd_reduce_f32x8_haswell(sum_vec); simsimd_size_t tail_length = n % 8; simsimd_size_t tail_start = n - tail_length; if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { simsimd_f32_t a_i = simsimd_uncompress_bf16(a + i); - __m256 b_vec = simsimd_bf16x8_to_f32x8_haswell( // - simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length)); - __m256 c_vec = simsimd_bf16x8_to_f32x8_haswell( // - simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length)); - simsimd_f32_t partial_sum = _mm256_reduce_add_ps_dbl(_mm256_mul_ps(b_vec, c_vec)); + __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell( // + _simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length)); + __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell( // + _simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length)); + simsimd_f32_t partial_sum = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(b_vec, c_vec)); sum += a_i * partial_sum; } } @@ -535,28 +535,28 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const* a, si _mm256_set1_ps(simsimd_uncompress_bf16(b + i))); __m256 partial_sum_vec = _mm256_setzero_ps(); for (simsimd_size_t j = 0; j + 8 <= n; j += 8) { - __m256 diff_j_vec = _mm256_sub_ps( // - simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(a + j))), // - simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j)))); - __m256 c_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j))); + __m256 diff_j_vec = _mm256_sub_ps( // + _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(a + j))), // + _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j)))); + __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j))); partial_sum_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, partial_sum_vec); } sum_vec = _mm256_fmadd_ps(diff_i_vec, partial_sum_vec, sum_vec); } // Handle the tail of every row - simsimd_f64_t sum = _mm256_reduce_add_ps_dbl(sum_vec); + simsimd_f64_t sum = _simsimd_reduce_f32x8_haswell(sum_vec); simsimd_size_t tail_length = n % 8; simsimd_size_t tail_start = n - tail_length; if (tail_length) { 
for (simsimd_size_t i = 0; i != n; ++i) { simsimd_f32_t diff_i = simsimd_uncompress_bf16(a + i) - simsimd_uncompress_bf16(b + i); __m256 diff_j_vec = _mm256_sub_ps( // - simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(a + tail_start, tail_length)), - simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length))); - __m256 c_vec = simsimd_bf16x8_to_f32x8_haswell( - simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length)); - simsimd_f32_t partial_sum = _mm256_reduce_add_ps_dbl(_mm256_mul_ps(diff_j_vec, c_vec)); + _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a + tail_start, tail_length)), + _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length))); + __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell( + _simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length)); + simsimd_f32_t partial_sum = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(diff_j_vec, c_vec)); sum += diff_i * partial_sum; } } @@ -705,7 +705,7 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_genoa(simsimd_bf16_t const* a, sims b_j_vec = _mm512_maskz_loadu_epi16(tail_mask, b + tail_start); c_vec = _mm512_maskz_loadu_epi16(tail_mask, c + i * n + tail_start); } - diff_j_vec = simsimd_substract_bf16x32_genoa(a_j_vec, b_j_vec); + diff_j_vec = _simsimd_substract_bf16x32_genoa(a_j_vec, b_j_vec); partial_sum_vec = _mm512_dpbf16_ps(partial_sum_vec, (__m512bh)(diff_j_vec), (__m512bh)(c_vec)); j += 32; if (j < n) diff --git a/include/simsimd/dot.h b/include/simsimd/dot.h index 554378f2..0134f774 100644 --- a/include/simsimd/dot.h +++ b/include/simsimd/dot.h @@ -229,7 +229,7 @@ SIMSIMD_MAKE_COMPLEX_VDOT(accurate, bf16, f64, SIMSIMD_UNCOMPRESS_BF16) // simsi #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -SIMSIMD_INTERNAL float32x4_t simsimd_partial_load_f32x4_neon(simsimd_f32_t const* a, simsimd_size_t n) { +SIMSIMD_INTERNAL float32x4_t _simsimd_partial_load_f32x4_neon(simsimd_f32_t const* a, simsimd_size_t n) { union { float32x4_t vec; simsimd_f32_t scalars[4]; @@ -373,7 +373,7 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const* a, simsimd_i8_t cons #pragma GCC target("arch=armv8.2-a+simd+fp16") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16"))), apply_to = function) -SIMSIMD_INTERNAL float16x4_t simsimd_partial_load_f16x4_neon(simsimd_f16_t const* a, simsimd_size_t n) { +SIMSIMD_INTERNAL float16x4_t _simsimd_partial_load_f16x4_neon(simsimd_f16_t const* a, simsimd_size_t n) { // In case the software emulation for `f16` scalars is enabled, the `simsimd_uncompress_f16` // function will run. It is extremely slow, so even for the tail, let's combine serial // loads and stores with vectorized math. 
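All the `_simsimd_partial_load_*` helpers renamed in this patch follow the same tail-handling pattern the comment above describes: copy the last `n` scalars into a zero-initialized stack buffer and load one full register from it, so the main loop never reads past the end of the input. A minimal scalar sketch of that pattern, with an illustrative stand-in type instead of the real `float32x4_t`/`__m256` registers:

```c
#include <stddef.h>
#include <string.h>

// Illustrative 4-lane "register"; the real helpers use NEON/AVX vector types.
typedef struct { float lanes[4]; } fake_f32x4_t;

// Load `n < 4` trailing floats into a full register, zero-padding the rest,
// mirroring the union-based `_simsimd_partial_load_f32x4_neon` shown above.
static fake_f32x4_t partial_load_f32x4(float const *a, size_t n) {
    union {
        fake_f32x4_t vec;
        float scalars[4];
    } result;
    memset(&result, 0, sizeof(result)); // zero the unused lanes
    for (size_t i = 0; i < n; ++i) result.scalars[i] = a[i];
    return result.vec;
}
```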
@@ -397,8 +397,8 @@ SIMSIMD_PUBLIC void simsimd_dot_f16_neon(simsimd_f16_t const* a, simsimd_f16_t c simsimd_dot_f16_neon_cycle: if (n < 4) { - a_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a, n)); - b_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b, n)); + a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n)); + b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n)); n = 0; } else { a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const*)a)); @@ -490,7 +490,7 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_neon(simsimd_f16_t const* a, simsimd_f16_t #pragma GCC target("arch=armv8.6-a+simd+bf16") #pragma clang attribute push(__attribute__((target("arch=armv8.6-a+simd+bf16"))), apply_to = function) -SIMSIMD_INTERNAL bfloat16x8_t simsimd_partial_load_bf16x8_neon(simsimd_bf16_t const* a, simsimd_size_t n) { +SIMSIMD_INTERNAL bfloat16x8_t _simsimd_partial_load_bf16x8_neon(simsimd_bf16_t const* a, simsimd_size_t n) { union { bfloat16x8_t vec; simsimd_bf16_t scalars[8]; @@ -507,11 +507,11 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_ simsimd_distance_t* result) { float32x4_t ab_vec = vdupq_n_f32(0); bfloat16x8_t a_vec, b_vec; - + simsimd_dot_bf16_neon_cycle: if (n < 8) { - a_vec = simsimd_partial_load_bf16x8_neon(a, n); - b_vec = simsimd_partial_load_bf16x8_neon(b, n); + a_vec = _simsimd_partial_load_bf16x8_neon(a, n); + b_vec = _simsimd_partial_load_bf16x8_neon(b, n); n = 0; } else { a_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const*)a); @@ -806,7 +806,7 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_sve(simsimd_f16_t const* a, simsimd_f16_t #pragma GCC target("avx2", "f16c", "fma") #pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function) -SIMSIMD_INTERNAL simsimd_f64_t _mm256_reduce_add_ps_dbl(__m256 vec) { +SIMSIMD_INTERNAL simsimd_f64_t _simsimd_reduce_f32x8_haswell(__m256 vec) { // Convert the lower and higher 128-bit lanes of the input vector to double precision __m128 low_f32 = _mm256_castps256_ps128(vec); __m128 high_f32 = _mm256_extractf128_ps(vec, 1); @@ -841,7 +841,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f32_haswell(simsimd_f32_t const* a, simsimd_f32_ __m256 b_vec = _mm256_loadu_ps(b + i); ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec); } - simsimd_f64_t ab = _mm256_reduce_add_ps_dbl(ab_vec); + simsimd_f64_t ab = _simsimd_reduce_f32x8_haswell(ab_vec); for (; i < n; ++i) ab += a[i] * b[i]; *results = ab; @@ -876,7 +876,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f32c_haswell(simsimd_f32_t const* a, simsimd_f32 // To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit. // This way we can avoid the shuffling and the need for separate real and imaginary parts. // For the imaginary part of the product, we would need to swap the real and imaginary parts of - // one of the vectors. + // one of the vectors. Moreover, `XOR` can be placed after the primary loop. // Both operations are quite cheap, and the throughput doubles from 2.5 GB/s to 5 GB/s. 
__m256 ab_real_vec = _mm256_setzero_ps(); __m256 ab_imag_vec = _mm256_setzero_ps(); @@ -895,15 +895,17 @@ SIMSIMD_PUBLIC void simsimd_dot_f32c_haswell(simsimd_f32_t const* a, simsimd_f32 for (; i + 8 <= n; i += 8) { __m256 a_vec = _mm256_loadu_ps(a + i); __m256 b_vec = _mm256_loadu_ps(b + i); - __m256 b_flipped_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(b_vec), sign_flip_vec)); __m256 b_swapped_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec)); - ab_real_vec = _mm256_fmadd_ps(a_vec, b_flipped_vec, ab_real_vec); + ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec); ab_imag_vec = _mm256_fmadd_ps(a_vec, b_swapped_vec, ab_imag_vec); } + // Flip the sign bit in every second scalar before accumulation: + ab_real_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(ab_real_vec), sign_flip_vec)); + // Reduce horizontal sums: - simsimd_distance_t ab_real = _mm256_reduce_add_ps_dbl(ab_real_vec); - simsimd_distance_t ab_imag = _mm256_reduce_add_ps_dbl(ab_imag_vec); + simsimd_distance_t ab_real = _simsimd_reduce_f32x8_haswell(ab_real_vec); + simsimd_distance_t ab_imag = _simsimd_reduce_f32x8_haswell(ab_imag_vec); // Handle the tail: for (; i + 2 <= n; i += 2) { @@ -936,14 +938,16 @@ SIMSIMD_PUBLIC void simsimd_vdot_f32c_haswell(simsimd_f32_t const* a, simsimd_f3 __m256 a_vec = _mm256_loadu_ps(a + i); __m256 b_vec = _mm256_loadu_ps(b + i); ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec); - a_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(a_vec), sign_flip_vec)); b_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec)); ab_imag_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_imag_vec); } + // Flip the sign bit in every second scalar before accumulation: + ab_imag_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(ab_imag_vec), sign_flip_vec)); + // Reduce horizontal sums: - simsimd_distance_t ab_real = _mm256_reduce_add_ps_dbl(ab_real_vec); - simsimd_distance_t ab_imag = _mm256_reduce_add_ps_dbl(ab_imag_vec); + simsimd_distance_t ab_real = _simsimd_reduce_f32x8_haswell(ab_real_vec); + simsimd_distance_t ab_imag = _simsimd_reduce_f32x8_haswell(ab_imag_vec); // Handle the tail: for (; i + 2 <= n; i += 2) { @@ -955,7 +959,7 @@ SIMSIMD_PUBLIC void simsimd_vdot_f32c_haswell(simsimd_f32_t const* a, simsimd_f3 results[1] = ab_imag; } -SIMSIMD_INTERNAL __m256 simsimd_partial_load_f16x8_haswell(simsimd_f16_t const* a, simsimd_size_t n) { +SIMSIMD_INTERNAL __m256 _simsimd_partial_load_f16x8_haswell(simsimd_f16_t const* a, simsimd_size_t n) { // In case the software emulation for `f16` scalars is enabled, the `simsimd_uncompress_f16` // function will run. It is extremely slow, so even for the tail, let's combine serial // loads and stores with vectorized math. 
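The `f32c`/`f16c` Haswell kernels in the hunks above are restructured so that the `XOR` sign flip moves out of the inner loop: the loop now accumulates plain lane-wise products, and the fixed sign pattern is applied once to the accumulator before the horizontal reduction. A scalar sketch of why the two orderings are equivalent (illustrative only, not the library's API; the two-element array stands in for a SIMD accumulator register):

```c
#include <stdio.h>

int main(void) {
    // Interleaved complex vectors: a = [re0, im0, re1, im1, ...], same for b.
    // The real part of the dot product is sum(re_a * re_b - im_a * im_b).
    float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float b[8] = {8, 7, 6, 5, 4, 3, 2, 1};

    // Baseline: flip the sign of every second product inside the loop.
    float real_flip_inside = 0;
    for (int i = 0; i < 8; ++i) real_flip_inside += (i % 2 ? -a[i] * b[i] : a[i] * b[i]);

    // Deferred flip: even lanes only ever accumulate re*re products and odd
    // lanes only im*im products, so the sign pattern can be applied once to
    // the accumulator after the loop (the `XOR` moved past the primary loop).
    float lanes[2] = {0, 0};
    for (int i = 0; i < 8; ++i) lanes[i % 2] += a[i] * b[i];
    lanes[1] = -lanes[1];                        // deferred sign flip
    float real_flip_after = lanes[0] + lanes[1]; // horizontal reduction

    printf("%f == %f\n", real_flip_inside, real_flip_after);
    return 0;
}
```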
@@ -978,8 +982,8 @@ SIMSIMD_PUBLIC void simsimd_dot_f16_haswell(simsimd_f16_t const* a, simsimd_f16_ simsimd_dot_f16_haswell_cycle: if (n < 8) { - a_vec = simsimd_partial_load_f16x8_haswell(a, n); - b_vec = simsimd_partial_load_f16x8_haswell(b, n); + a_vec = _simsimd_partial_load_f16x8_haswell(a, n); + b_vec = _simsimd_partial_load_f16x8_haswell(b, n); n = 0; } else { a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a)); @@ -990,7 +994,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f16_haswell(simsimd_f16_t const* a, simsimd_f16_ if (n) goto simsimd_dot_f16_haswell_cycle; - *result = _mm256_reduce_add_ps_dbl(ab_vec); + *result = _simsimd_reduce_f32x8_haswell(ab_vec); } SIMSIMD_PUBLIC void simsimd_dot_f16c_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, @@ -1019,18 +1023,20 @@ SIMSIMD_PUBLIC void simsimd_dot_f16c_haswell(simsimd_f16_t const* a, simsimd_f16 while (n >= 8) { __m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a)); __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b)); - __m256 b_flipped_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(b_vec), sign_flip_vec)); __m256 b_swapped_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec)); - ab_real_vec = _mm256_fmadd_ps(a_vec, b_flipped_vec, ab_real_vec); + ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec); ab_imag_vec = _mm256_fmadd_ps(a_vec, b_swapped_vec, ab_imag_vec); n -= 8, a += 8, b += 8; } + // Flip the sign bit in every second scalar before accumulation: + ab_real_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(ab_real_vec), sign_flip_vec)); + // Reduce horizontal sums and aggregate with the tail: simsimd_dot_f16c_serial(a, b, n, results); - results[0] += _mm256_reduce_add_ps_dbl(ab_real_vec); - results[1] += _mm256_reduce_add_ps_dbl(ab_imag_vec); + results[0] += _simsimd_reduce_f32x8_haswell(ab_real_vec); + results[1] += _simsimd_reduce_f32x8_haswell(ab_imag_vec); } SIMSIMD_PUBLIC void simsimd_vdot_f16c_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, @@ -1054,17 +1060,19 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_haswell(simsimd_f16_t const* a, simsimd_f1 __m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a)); __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b)); ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec); - a_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(a_vec), sign_flip_vec)); b_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec)); ab_imag_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_imag_vec); n -= 8, a += 8, b += 8; } + // Flip the sign bit in every second scalar before accumulation: + ab_imag_vec = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(ab_imag_vec), sign_flip_vec)); + // Reduce horizontal sums and aggregate with the tail: simsimd_dot_f16c_serial(a, b, n, results); - results[0] = _mm256_reduce_add_ps_dbl(ab_real_vec); - results[1] = _mm256_reduce_add_ps_dbl(ab_imag_vec); + results[0] += _simsimd_reduce_f32x8_haswell(ab_real_vec); + results[1] += _simsimd_reduce_f32x8_haswell(ab_imag_vec); } SIMSIMD_PUBLIC void simsimd_dot_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, @@ -1102,12 +1110,12 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t c *result = ab; } -SIMSIMD_INTERNAL __m256 simsimd_bf16x8_to_f32x8_haswell(__m128i a) { +SIMSIMD_INTERNAL __m256 _simsimd_bf16x8_to_f32x8_haswell(__m128i a) { // Upcasting 
from `bf16` to `f32` is done by shifting the `bf16` values by 16 bits to the left, like: return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16)); } -SIMSIMD_INTERNAL __m128i simsimd_partial_load_bf16x8_haswell(simsimd_bf16_t const* a, simsimd_size_t n) { +SIMSIMD_INTERNAL __m128i _simsimd_partial_load_bf16x8_haswell(simsimd_bf16_t const* a, simsimd_size_t n) { // In case the software emulation for `bf16` scalars is enabled, the `simsimd_uncompress_bf16` // function will run. It is extremely slow, so even for the tail, let's combine serial // loads and stores with vectorized math. @@ -1130,19 +1138,19 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf simsimd_dot_bf16_haswell_cycle: if (n < 8) { - a_vec = simsimd_partial_load_bf16x8_haswell(a, n); - b_vec = simsimd_partial_load_bf16x8_haswell(b, n); + a_vec = _simsimd_partial_load_bf16x8_haswell(a, n); + b_vec = _simsimd_partial_load_bf16x8_haswell(b, n); n = 0; } else { a_vec = _mm_loadu_si128((__m128i const*)a); b_vec = _mm_loadu_si128((__m128i const*)b); a += 8, b += 8, n -= 8; } - ab_vec = _mm256_fmadd_ps(simsimd_bf16x8_to_f32x8_haswell(a_vec), simsimd_bf16x8_to_f32x8_haswell(b_vec), ab_vec); + ab_vec = _mm256_fmadd_ps(_simsimd_bf16x8_to_f32x8_haswell(a_vec), _simsimd_bf16x8_to_f32x8_haswell(b_vec), ab_vec); if (n) goto simsimd_dot_bf16_haswell_cycle; - *result = _mm256_reduce_add_ps_dbl(ab_vec); + *result = _simsimd_reduce_f32x8_haswell(ab_vec); } #pragma clang attribute pop @@ -1154,6 +1162,13 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf #pragma GCC target("avx512f", "avx512vl", "avx512bw", "bmi2") #pragma clang attribute push(__attribute__((target("avx512f,avx512vl,avx512bw,bmi2"))), apply_to = function) +SIMSIMD_INTERNAL simsimd_f64_t _simsimd_reduce_f32x16_skylake(__m512 a) { + __m512 x = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(0, 0, 3, 2))); + __m128 r = _mm512_castps512_ps128(_mm512_add_ps(x, _mm512_shuffle_f32x4(x, x, _MM_SHUFFLE(0, 0, 0, 1)))); + r = _mm_hadd_ps(r, r); + return _mm_cvtss_f32(_mm_hadd_ps(r, r)); +} + SIMSIMD_PUBLIC void simsimd_dot_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result) { __m512 ab_vec = _mm512_setzero(); @@ -1174,7 +1189,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f32_skylake(simsimd_f32_t const* a, simsimd_f32_ if (n) goto simsimd_dot_f32_skylake_cycle; - *result = _mm512_reduce_add_ps(ab_vec); + *result = _simsimd_reduce_f32x16_skylake(ab_vec); } SIMSIMD_PUBLIC void simsimd_dot_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, @@ -1202,11 +1217,10 @@ SIMSIMD_PUBLIC void simsimd_dot_f64_skylake(simsimd_f64_t const* a, simsimd_f64_ SIMSIMD_PUBLIC void simsimd_dot_f32c_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* results) { - __m512 ab_real_vec = _mm512_setzero(); __m512 ab_imag_vec = _mm512_setzero(); __m512 a_vec; - __m512i b_vec; + __m512 b_vec; // We take into account, that FMS is the same as FMA with a negative multiplier. // To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit. 
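Regarding the `_simsimd_bf16x8_to_f32x8_haswell` helper renamed above: as its comment notes, a `bf16` value is simply the upper 16 bits of an IEEE-754 single-precision float, so upcasting is a widening 16-bit left shift of the raw bits. A scalar sketch of the same bit trick (illustrative, assuming `float` is IEEE-754 `binary32` on the target platform):

```c
#include <stdint.h>
#include <string.h>

// Upcast one bf16 value (raw 16 bits) to f32 by placing it in the top half of
// a 32-bit word: the scalar equivalent of the vectorized
// `_mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16)` used in the header.
static float bf16_bits_to_f32(uint16_t bf16_bits) {
    uint32_t f32_bits = (uint32_t)bf16_bits << 16;
    float result;
    memcpy(&result, &f32_bits, sizeof(result)); // bit-cast without aliasing UB
    return result;
}
```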
@@ -1224,27 +1238,29 @@ SIMSIMD_PUBLIC void simsimd_dot_f32c_skylake(simsimd_f32_t const* a, simsimd_f32 if (n < 16) { __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n); a_vec = _mm512_maskz_loadu_ps(mask, a); - b_vec = _mm512_castps_si512(_mm512_maskz_loadu_ps(mask, b)); + b_vec = _mm512_maskz_loadu_ps(mask, b); n = 0; } else { a_vec = _mm512_loadu_ps(a); - b_vec = _mm512_castps_si512(_mm512_loadu_ps(b)); + b_vec = _mm512_loadu_ps(b); a += 16, b += 16, n -= 16; } - ab_real_vec = _mm512_fmadd_ps(_mm512_castsi512_ps(_mm512_xor_si512(b_vec, sign_flip_vec)), a_vec, ab_real_vec); - ab_imag_vec = - _mm512_fmadd_ps(_mm512_castsi512_ps(_mm512_shuffle_epi8(b_vec, swap_adjacent_vec)), a_vec, ab_imag_vec); + ab_real_vec = _mm512_fmadd_ps(b_vec, a_vec, ab_real_vec); + ab_imag_vec = _mm512_fmadd_ps( + _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(b_vec), swap_adjacent_vec)), a_vec, ab_imag_vec); if (n) goto simsimd_dot_f32c_skylake_cycle; + // Flip the sign bit in every second scalar before accumulation: + ab_real_vec = _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(ab_real_vec), sign_flip_vec)); + // Reduce horizontal sums: - results[0] = _mm512_reduce_add_ps(ab_real_vec); - results[1] = _mm512_reduce_add_ps(ab_imag_vec); + results[0] = _simsimd_reduce_f32x16_skylake(ab_real_vec); + results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec); } SIMSIMD_PUBLIC void simsimd_vdot_f32c_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* results) { - __m512 ab_real_vec = _mm512_setzero(); __m512 ab_imag_vec = _mm512_setzero(); __m512 a_vec; @@ -1274,24 +1290,25 @@ SIMSIMD_PUBLIC void simsimd_vdot_f32c_skylake(simsimd_f32_t const* a, simsimd_f3 a += 16, b += 16, n -= 16; } ab_real_vec = _mm512_fmadd_ps(a_vec, b_vec, ab_real_vec); - a_vec = _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(a_vec), sign_flip_vec)); b_vec = _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(b_vec), swap_adjacent_vec)); ab_imag_vec = _mm512_fmadd_ps(a_vec, b_vec, ab_imag_vec); if (n) goto simsimd_vdot_f32c_skylake_cycle; + // Flip the sign bit in every second scalar before accumulation: + ab_imag_vec = _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(ab_imag_vec), sign_flip_vec)); + // Reduce horizontal sums: - results[0] = _mm512_reduce_add_ps(ab_real_vec); - results[1] = _mm512_reduce_add_ps(ab_imag_vec); + results[0] = _simsimd_reduce_f32x16_skylake(ab_real_vec); + results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec); } SIMSIMD_PUBLIC void simsimd_dot_f64c_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* results) { - __m512d ab_real_vec = _mm512_setzero_pd(); __m512d ab_imag_vec = _mm512_setzero_pd(); __m512d a_vec; - __m512i b_vec; + __m512d b_vec; // We take into account, that FMS is the same as FMA with a negative multiplier. // To multiply a floating-point value by -1, we can use the `XOR` instruction to flip the sign bit. 
@@ -1312,19 +1329,22 @@ SIMSIMD_PUBLIC void simsimd_dot_f64c_skylake(simsimd_f64_t const* a, simsimd_f64 if (n < 8) { __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n); a_vec = _mm512_maskz_loadu_pd(mask, a); - b_vec = _mm512_castpd_si512(_mm512_maskz_loadu_pd(mask, b)); + b_vec = _mm512_maskz_loadu_pd(mask, b); n = 0; } else { a_vec = _mm512_loadu_pd(a); - b_vec = _mm512_castpd_si512(_mm512_loadu_pd(b)); + b_vec = _mm512_loadu_pd(b); a += 8, b += 8, n -= 8; } - ab_real_vec = _mm512_fmadd_pd(_mm512_castsi512_pd(_mm512_xor_si512(b_vec, sign_flip_vec)), a_vec, ab_real_vec); - ab_imag_vec = - _mm512_fmadd_pd(_mm512_castsi512_pd(_mm512_shuffle_epi8(b_vec, swap_adjacent_vec)), a_vec, ab_imag_vec); + ab_real_vec = _mm512_fmadd_pd(b_vec, a_vec, ab_real_vec); + ab_imag_vec = _mm512_fmadd_pd( + _mm512_castsi512_pd(_mm512_shuffle_epi8(_mm512_castpd_si512(b_vec), swap_adjacent_vec)), a_vec, ab_imag_vec); if (n) goto simsimd_dot_f64c_skylake_cycle; + // Flip the sign bit in every second scalar before accumulation: + ab_real_vec = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(ab_real_vec), sign_flip_vec)); + // Reduce horizontal sums: results[0] = _mm512_reduce_add_pd(ab_real_vec); results[1] = _mm512_reduce_add_pd(ab_imag_vec); @@ -1332,7 +1352,6 @@ SIMSIMD_PUBLIC void simsimd_dot_f64c_skylake(simsimd_f64_t const* a, simsimd_f64 SIMSIMD_PUBLIC void simsimd_vdot_f64c_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, simsimd_distance_t* results) { - __m512d ab_real_vec = _mm512_setzero_pd(); __m512d ab_imag_vec = _mm512_setzero_pd(); __m512d a_vec; @@ -1365,12 +1384,14 @@ SIMSIMD_PUBLIC void simsimd_vdot_f64c_skylake(simsimd_f64_t const* a, simsimd_f6 a += 8, b += 8, n -= 8; } ab_real_vec = _mm512_fmadd_pd(a_vec, b_vec, ab_real_vec); - a_vec = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(a_vec), sign_flip_vec)); b_vec = _mm512_castsi512_pd(_mm512_shuffle_epi8(_mm512_castpd_si512(b_vec), swap_adjacent_vec)); ab_imag_vec = _mm512_fmadd_pd(a_vec, b_vec, ab_imag_vec); if (n) goto simsimd_vdot_f64c_skylake_cycle; + // Flip the sign bit in every second scalar before accumulation: + ab_imag_vec = _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(ab_imag_vec), sign_flip_vec)); + // Reduce horizontal sums: results[0] = _mm512_reduce_add_pd(ab_real_vec); results[1] = _mm512_reduce_add_pd(ab_imag_vec); @@ -1405,12 +1426,11 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16 if (n) goto simsimd_dot_bf16_genoa_cycle; - *result = _mm512_reduce_add_ps(ab_vec); + *result = _simsimd_reduce_f32x16_skylake(ab_vec); } SIMSIMD_PUBLIC void simsimd_dot_bf16c_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* results) { - __m512 ab_real_vec = _mm512_setzero_ps(); __m512 ab_imag_vec = _mm512_setzero_ps(); __m512i a_vec; @@ -1447,13 +1467,12 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16c_genoa(simsimd_bf16_t const* a, simsimd_bf1 goto simsimd_dot_bf16c_genoa_cycle; // Reduce horizontal sums: - results[0] = _mm512_reduce_add_ps(ab_real_vec); - results[1] = _mm512_reduce_add_ps(ab_imag_vec); + results[0] = _simsimd_reduce_f32x16_skylake(ab_real_vec); + results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec); } SIMSIMD_PUBLIC void simsimd_vdot_bf16c_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* results) { - __m512 ab_real_vec = _mm512_setzero_ps(); __m512 ab_imag_vec = _mm512_setzero_ps(); __m512i a_vec; @@ -1491,8 +1510,8 @@ SIMSIMD_PUBLIC void 
simsimd_vdot_bf16c_genoa(simsimd_bf16_t const* a, simsimd_bf goto simsimd_dot_bf16c_genoa_cycle; // Reduce horizontal sums: - results[0] = _mm512_reduce_add_ps(ab_real_vec); - results[1] = _mm512_reduce_add_ps(ab_imag_vec); + results[0] = _simsimd_reduce_f32x16_skylake(ab_real_vec); + results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec); } #pragma clang attribute pop @@ -1529,7 +1548,6 @@ SIMSIMD_PUBLIC void simsimd_dot_f16_sapphire(simsimd_f16_t const* a, simsimd_f16 SIMSIMD_PUBLIC void simsimd_dot_f16c_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* results) { - __m512h ab_real_vec = _mm512_setzero_ph(); __m512h ab_imag_vec = _mm512_setzero_ph(); __m512i a_vec; @@ -1575,7 +1593,6 @@ SIMSIMD_PUBLIC void simsimd_dot_f16c_sapphire(simsimd_f16_t const* a, simsimd_f1 SIMSIMD_PUBLIC void simsimd_vdot_f16c_sapphire(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* results) { - __m512h ab_real_vec = _mm512_setzero_ph(); __m512h ab_imag_vec = _mm512_setzero_ph(); __m512i a_vec; diff --git a/include/simsimd/probability.h b/include/simsimd/probability.h index 2773e999..e39d81d1 100644 --- a/include/simsimd/probability.h +++ b/include/simsimd/probability.h @@ -138,7 +138,7 @@ SIMSIMD_MAKE_JS(accurate, bf16, f64, SIMSIMD_UNCOMPRESS_BF16, SIMSIMD_F32_DIVISI #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -SIMSIMD_PUBLIC float32x4_t simsimd_log2_f32_neon(float32x4_t x) { +SIMSIMD_PUBLIC float32x4_t _simsimd_log2_f32_neon(float32x4_t x) { // Extracting the exponent int32x4_t i = vreinterpretq_s32_f32(x); int32x4_t e = vsubq_s32(vshrq_n_s32(vandq_s32(i, vdupq_n_s32(0x7F800000)), 23), vdupq_n_s32(127)); @@ -172,8 +172,8 @@ SIMSIMD_PUBLIC void simsimd_kl_f32_neon(simsimd_f32_t const* a, simsimd_f32_t co simsimd_kl_f32_neon_cycle: if (n < 4) { - a_vec = simsimd_partial_load_f32x4_neon(a, n); - b_vec = simsimd_partial_load_f32x4_neon(b, n); + a_vec = _simsimd_partial_load_f32x4_neon(a, n); + b_vec = _simsimd_partial_load_f32x4_neon(b, n); n = 0; } else { a_vec = vld1q_f32(a); @@ -182,7 +182,7 @@ SIMSIMD_PUBLIC void simsimd_kl_f32_neon(simsimd_f32_t const* a, simsimd_f32_t co } float32x4_t ratio_vec = vdivq_f32(vaddq_f32(a_vec, epsilon_vec), vaddq_f32(b_vec, epsilon_vec)); - float32x4_t log_ratio_vec = simsimd_log2_f32_neon(ratio_vec); + float32x4_t log_ratio_vec = _simsimd_log2_f32_neon(ratio_vec); float32x4_t prod_vec = vmulq_f32(a_vec, log_ratio_vec); sum_vec = vaddq_f32(sum_vec, prod_vec); if (n != 0) @@ -202,8 +202,8 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const* a, simsimd_f32_t co simsimd_js_f32_neon_cycle: if (n < 4) { - a_vec = simsimd_partial_load_f32x4_neon(a, n); - b_vec = simsimd_partial_load_f32x4_neon(b, n); + a_vec = _simsimd_partial_load_f32x4_neon(a, n); + b_vec = _simsimd_partial_load_f32x4_neon(b, n); n = 0; } else { a_vec = vld1q_f32(a); @@ -214,8 +214,8 @@ SIMSIMD_PUBLIC void simsimd_js_f32_neon(simsimd_f32_t const* a, simsimd_f32_t co float32x4_t m_vec = vmulq_f32(vaddq_f32(a_vec, b_vec), vdupq_n_f32(0.5)); float32x4_t ratio_a_vec = vdivq_f32(vaddq_f32(a_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); - float32x4_t log_ratio_a_vec = simsimd_log2_f32_neon(ratio_a_vec); - float32x4_t log_ratio_b_vec = simsimd_log2_f32_neon(ratio_b_vec); + float32x4_t log_ratio_a_vec = 
_simsimd_log2_f32_neon(ratio_a_vec); + float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); @@ -245,8 +245,8 @@ SIMSIMD_PUBLIC void simsimd_kl_f16_neon(simsimd_f16_t const* a, simsimd_f16_t co simsimd_kl_f16_neon_cycle: if (n < 4) { - a_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a, n)); - b_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b, n)); + a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n)); + b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n)); n = 0; } else { a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const*)a)); @@ -255,7 +255,7 @@ SIMSIMD_PUBLIC void simsimd_kl_f16_neon(simsimd_f16_t const* a, simsimd_f16_t co } float32x4_t ratio_vec = vdivq_f32(vaddq_f32(a_vec, epsilon_vec), vaddq_f32(b_vec, epsilon_vec)); - float32x4_t log_ratio_vec = simsimd_log2_f32_neon(ratio_vec); + float32x4_t log_ratio_vec = _simsimd_log2_f32_neon(ratio_vec); float32x4_t prod_vec = vmulq_f32(a_vec, log_ratio_vec); sum_vec = vaddq_f32(sum_vec, prod_vec); if (n) @@ -275,8 +275,8 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const* a, simsimd_f16_t co simsimd_js_f16_neon_cycle: if (n < 4) { - a_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a, n)); - b_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b, n)); + a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n)); + b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n)); n = 0; } else { a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const*)a)); @@ -287,8 +287,8 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const* a, simsimd_f16_t co float32x4_t m_vec = vmulq_f32(vaddq_f32(a_vec, b_vec), vdupq_n_f32(0.5)); float32x4_t ratio_a_vec = vdivq_f32(vaddq_f32(a_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); float32x4_t ratio_b_vec = vdivq_f32(vaddq_f32(b_vec, epsilon_vec), vaddq_f32(m_vec, epsilon_vec)); - float32x4_t log_ratio_a_vec = simsimd_log2_f32_neon(ratio_a_vec); - float32x4_t log_ratio_b_vec = simsimd_log2_f32_neon(ratio_b_vec); + float32x4_t log_ratio_a_vec = _simsimd_log2_f32_neon(ratio_a_vec); + float32x4_t log_ratio_b_vec = _simsimd_log2_f32_neon(ratio_b_vec); float32x4_t prod_a_vec = vmulq_f32(a_vec, log_ratio_a_vec); float32x4_t prod_b_vec = vmulq_f32(b_vec, log_ratio_b_vec); sum_vec = vaddq_f32(sum_vec, vaddq_f32(prod_a_vec, prod_b_vec)); @@ -311,7 +311,7 @@ SIMSIMD_PUBLIC void simsimd_js_f16_neon(simsimd_f16_t const* a, simsimd_f16_t co #pragma GCC target("avx2", "f16c", "fma") #pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function) -SIMSIMD_INTERNAL __m256 simsimd_log2_f32_haswell(__m256 x) { +SIMSIMD_INTERNAL __m256 _simsimd_log2_f32_haswell(__m256 x) { // Extracting the exponent __m256i i = _mm256_castps_si256(x); __m256i e = _mm256_srli_epi32(_mm256_and_si256(i, _mm256_set1_epi32(0x7F800000)), 23); @@ -347,8 +347,8 @@ SIMSIMD_PUBLIC void simsimd_kl_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t simsimd_kl_f16_haswell_cycle: if (n < 8) { - a_vec = simsimd_partial_load_f16x8_haswell(a, n); - b_vec = simsimd_partial_load_f16x8_haswell(b, n); + a_vec = _simsimd_partial_load_f16x8_haswell(a, n); + b_vec = _simsimd_partial_load_f16x8_haswell(b, n); n = 0; } else { a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a)); @@ -357,14 +357,14 @@ SIMSIMD_PUBLIC void simsimd_kl_f16_haswell(simsimd_f16_t const* a, 
simsimd_f16_t } __m256 ratio_vec = _mm256_div_ps(_mm256_add_ps(a_vec, epsilon_vec), _mm256_add_ps(b_vec, epsilon_vec)); - __m256 log_ratio_vec = simsimd_log2_f32_haswell(ratio_vec); + __m256 log_ratio_vec = _simsimd_log2_f32_haswell(ratio_vec); __m256 prod_vec = _mm256_mul_ps(a_vec, log_ratio_vec); sum_vec = _mm256_add_ps(sum_vec, prod_vec); if (n) goto simsimd_kl_f16_haswell_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; - simsimd_f32_t sum = _mm256_reduce_add_ps_dbl(sum_vec); + simsimd_f32_t sum = _simsimd_reduce_f32x8_haswell(sum_vec); sum *= log2_normalizer; *result = sum; } @@ -378,8 +378,8 @@ SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t simsimd_js_f16_haswell_cycle: if (n < 8) { - a_vec = simsimd_partial_load_f16x8_haswell(a, n); - b_vec = simsimd_partial_load_f16x8_haswell(b, n); + a_vec = _simsimd_partial_load_f16x8_haswell(a, n); + b_vec = _simsimd_partial_load_f16x8_haswell(b, n); n = 0; } else { a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a)); @@ -390,8 +390,8 @@ SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t __m256 m_vec = _mm256_mul_ps(_mm256_add_ps(a_vec, b_vec), _mm256_set1_ps(0.5f)); // M = (P + Q) / 2 __m256 ratio_a_vec = _mm256_div_ps(a_vec, m_vec); __m256 ratio_b_vec = _mm256_div_ps(b_vec, m_vec); - __m256 log_ratio_a_vec = simsimd_log2_f32_haswell(ratio_a_vec); - __m256 log_ratio_b_vec = simsimd_log2_f32_haswell(ratio_b_vec); + __m256 log_ratio_a_vec = _simsimd_log2_f32_haswell(ratio_a_vec); + __m256 log_ratio_b_vec = _simsimd_log2_f32_haswell(ratio_b_vec); __m256 prod_a_vec = _mm256_mul_ps(a_vec, log_ratio_a_vec); __m256 prod_b_vec = _mm256_mul_ps(b_vec, log_ratio_b_vec); sum_vec = _mm256_add_ps(sum_vec, prod_a_vec); @@ -400,7 +400,7 @@ SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t goto simsimd_js_f16_haswell_cycle; simsimd_f32_t log2_normalizer = 0.693147181f; - simsimd_f32_t sum = _mm256_reduce_add_ps_dbl(sum_vec); + simsimd_f32_t sum = _simsimd_reduce_f32x8_haswell(sum_vec); sum *= log2_normalizer; *result = sum / 2; } @@ -414,7 +414,7 @@ SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t #pragma GCC target("avx512f", "avx512vl", "bmi2") #pragma clang attribute push(__attribute__((target("avx512f,avx512vl,bmi2"))), apply_to = function) -SIMSIMD_INTERNAL __m512 simsimd_log2_f32_skylake(__m512 x) { +SIMSIMD_INTERNAL __m512 _simsimd_log2_f32_skylake(__m512 x) { // Extract the exponent and mantissa __m512 one = _mm512_set1_ps(1.0f); __m512 e = _mm512_getexp_ps(x); @@ -450,7 +450,7 @@ SIMSIMD_PUBLIC void simsimd_kl_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t a += 16, b += 16, n -= 16; } __m512 ratio_vec = _mm512_div_ps(a_vec, b_vec); - __m512 log_ratio_vec = simsimd_log2_f32_skylake(ratio_vec); + __m512 log_ratio_vec = _simsimd_log2_f32_skylake(ratio_vec); __m512 prod_vec = _mm512_mul_ps(a_vec, log_ratio_vec); sum_vec = _mm512_add_ps(sum_vec, prod_vec); if (n) @@ -486,8 +486,8 @@ SIMSIMD_PUBLIC void simsimd_js_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t __m512 m_recip_approx = _mm512_rcp14_ps(m_vec); __m512 ratio_a_vec = _mm512_mul_ps(a_vec, m_recip_approx); __m512 ratio_b_vec = _mm512_mul_ps(b_vec, m_recip_approx); - __m512 log_ratio_a_vec = simsimd_log2_f32_skylake(ratio_a_vec); - __m512 log_ratio_b_vec = simsimd_log2_f32_skylake(ratio_b_vec); + __m512 log_ratio_a_vec = _simsimd_log2_f32_skylake(ratio_a_vec); + __m512 log_ratio_b_vec = _simsimd_log2_f32_skylake(ratio_b_vec); sum_a_vec = 
_mm512_maskz_fmadd_ps(nonzero_mask, a_vec, log_ratio_a_vec, sum_a_vec); sum_b_vec = _mm512_maskz_fmadd_ps(nonzero_mask, b_vec, log_ratio_b_vec, sum_b_vec); if (n) @@ -506,7 +506,7 @@ SIMSIMD_PUBLIC void simsimd_js_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t #pragma GCC target("avx512f", "avx512vl", "bmi2", "avx512fp16") #pragma clang attribute push(__attribute__((target("avx512f,avx512vl,bmi2,avx512fp16"))), apply_to = function) -SIMSIMD_INTERNAL __m512h simsimd_log2_f16_sapphire(__m512h x) { +SIMSIMD_INTERNAL __m512h _simsimd_log2_f16_sapphire(__m512h x) { // Extract the exponent and mantissa __m512h one = _mm512_set1_ph((simsimd_f16_t)1); __m512h e = _mm512_getexp_ph(x); @@ -541,7 +541,7 @@ SIMSIMD_PUBLIC void simsimd_kl_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_ a += 32, b += 32, n -= 32; } __m512h ratio_vec = _mm512_div_ph(a_vec, b_vec); - __m512h log_ratio_vec = simsimd_log2_f16_sapphire(ratio_vec); + __m512h log_ratio_vec = _simsimd_log2_f16_sapphire(ratio_vec); __m512h prod_vec = _mm512_mul_ph(a_vec, log_ratio_vec); sum_vec = _mm512_add_ph(sum_vec, prod_vec); if (n) @@ -576,8 +576,8 @@ SIMSIMD_PUBLIC void simsimd_js_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_ __m512h m_recip_approx = _mm512_rcp_ph(m_vec); __m512h ratio_a_vec = _mm512_mul_ph(a_vec, m_recip_approx); __m512h ratio_b_vec = _mm512_mul_ph(b_vec, m_recip_approx); - __m512h log_ratio_a_vec = simsimd_log2_f16_sapphire(ratio_a_vec); - __m512h log_ratio_b_vec = simsimd_log2_f16_sapphire(ratio_b_vec); + __m512h log_ratio_a_vec = _simsimd_log2_f16_sapphire(ratio_a_vec); + __m512h log_ratio_b_vec = _simsimd_log2_f16_sapphire(ratio_b_vec); sum_a_vec = _mm512_maskz_fmadd_ph(nonzero_mask, a_vec, log_ratio_a_vec, sum_a_vec); sum_b_vec = _mm512_maskz_fmadd_ph(nonzero_mask, b_vec, log_ratio_b_vec, sum_b_vec); if (n) diff --git a/include/simsimd/spatial.h b/include/simsimd/spatial.h index 4e814c57..c2512426 100644 --- a/include/simsimd/spatial.h +++ b/include/simsimd/spatial.h @@ -27,7 +27,7 @@ #include "types.h" -#include "dot.h" // `_mm256_reduce_add_ps_dbl` +#include "dot.h" // `_simsimd_reduce_f32x8_haswell` #ifdef __cplusplus extern "C" { @@ -189,8 +189,8 @@ SIMSIMD_MAKE_COS(accurate, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_cos_i8_accu #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -SIMSIMD_INTERNAL simsimd_distance_t simsimd_cos_normalize_f64_neon(simsimd_f64_t ab, simsimd_f64_t a2, - simsimd_f64_t b2) { +SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_neon(simsimd_f64_t ab, simsimd_f64_t a2, + simsimd_f64_t b2) { if (a2 == 0 && b2 == 0) return 0; if (ab == 0) @@ -242,7 +242,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f32_neon(simsimd_f32_t const* a, simsimd_f32_t c ab += ai * bi, a2 += ai * ai, b2 += bi * bi; } - *result = simsimd_cos_normalize_f64_neon(ab, a2, b2); + *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } #pragma clang attribute pop @@ -261,8 +261,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f16_neon(simsimd_f16_t const* a, simsimd_f16_t simsimd_l2sq_f16_neon_cycle: if (n < 4) { - a_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a, n)); - b_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b, n)); + a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n)); + b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n)); n = 0; } else { a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const*)a)); @@ -284,8 +284,8 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_neon(simsimd_f16_t const* a, 
simsimd_f16_t c simsimd_cos_f16_neon_cycle: if (n < 4) { - a_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a, n)); - b_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b, n)); + a_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a, n)); + b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b, n)); n = 0; } else { a_vec = vcvt_f32_f16(vld1_f16((simsimd_f16_for_arm_simd_t const*)a)); @@ -299,7 +299,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_neon(simsimd_f16_t const* a, simsimd_f16_t c goto simsimd_cos_f16_neon_cycle; simsimd_f32_t ab = vaddvq_f32(ab_vec), a2 = vaddvq_f32(a2_vec), b2 = vaddvq_f32(b2_vec); - *result = simsimd_cos_normalize_f64_neon(ab, a2, b2); + *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } #pragma clang attribute pop @@ -349,8 +349,8 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_ simsimd_cos_bf16_neon_cycle: if (n < 8) { - a_vec = simsimd_partial_load_bf16x8_neon(a, n); - b_vec = simsimd_partial_load_bf16x8_neon(b, n); + a_vec = _simsimd_partial_load_bf16x8_neon(a, n); + b_vec = _simsimd_partial_load_bf16x8_neon(b, n); n = 0; } else { a_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const*)a); @@ -370,7 +370,7 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_ simsimd_f32_t ab = vaddvq_f32(vaddq_f32(ab_high_vec, ab_low_vec)), a2 = vaddvq_f32(vaddq_f32(a2_high_vec, a2_low_vec)), b2 = vaddvq_f32(vaddq_f32(b2_high_vec, b2_low_vec)); - *result = simsimd_cos_normalize_f64_neon(ab, a2, b2); + *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } SIMSIMD_PUBLIC void simsimd_l2sq_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, @@ -380,8 +380,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16 simsimd_l2sq_bf16_neon_cycle: if (n < 8) { - bfloat16x8_t a_vec = simsimd_partial_load_bf16x8_neon(a, n); - bfloat16x8_t b_vec = simsimd_partial_load_bf16x8_neon(b, n); + bfloat16x8_t a_vec = _simsimd_partial_load_bf16x8_neon(a, n); + bfloat16x8_t b_vec = _simsimd_partial_load_bf16x8_neon(b, n); diff_high_vec = vsubq_f32(vcvt_f32_bf16(vget_high_bf16(a_vec)), vcvt_f32_bf16(vget_high_bf16(b_vec))); diff_low_vec = vsubq_f32(vcvt_f32_bf16(vget_low_bf16(a_vec)), vcvt_f32_bf16(vget_low_bf16(b_vec))); n = 0; @@ -547,7 +547,7 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const* a, simsimd_i8_t cons ab += ai * bi, a2 += ai * ai, b2 += bi * bi; } - *result = simsimd_cos_normalize_f64_neon(ab, a2, b2); + *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } #pragma clang attribute pop @@ -594,7 +594,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f32_sve(simsimd_f32_t const* a, simsimd_f32_t co simsimd_f32_t ab = svaddv_f32(svptrue_b32(), ab_vec); simsimd_f32_t a2 = svaddv_f32(svptrue_b32(), a2_vec); simsimd_f32_t b2 = svaddv_f32(svptrue_b32(), b2_vec); - *result = simsimd_cos_normalize_f64_neon(ab, a2, b2); + *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } SIMSIMD_PUBLIC void simsimd_l2sq_f64_sve(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n, @@ -632,7 +632,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_sve(simsimd_f64_t const* a, simsimd_f64_t co simsimd_f64_t ab = svaddv_f64(svptrue_b32(), ab_vec); simsimd_f64_t a2 = svaddv_f64(svptrue_b32(), a2_vec); simsimd_f64_t b2 = svaddv_f64(svptrue_b32(), b2_vec); - *result = simsimd_cos_normalize_f64_neon(ab, a2, b2); + *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } #pragma clang attribute pop @@ -681,7 +681,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_sve(simsimd_f16_t const* 
a_enum, simsimd_f16 simsimd_f16_for_arm_simd_t ab = svaddv_f16(svptrue_b16(), ab_vec); simsimd_f16_for_arm_simd_t a2 = svaddv_f16(svptrue_b16(), a2_vec); simsimd_f16_for_arm_simd_t b2 = svaddv_f16(svptrue_b16(), b2_vec); - *result = simsimd_cos_normalize_f64_neon(ab, a2, b2); + *result = _simsimd_cos_normalize_f64_neon(ab, a2, b2); } #pragma clang attribute pop @@ -702,8 +702,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const* a, simsimd_f16 simsimd_l2sq_f16_haswell_cycle: if (n < 8) { - a_vec = simsimd_partial_load_f16x8_haswell(a, n); - b_vec = simsimd_partial_load_f16x8_haswell(b, n); + a_vec = _simsimd_partial_load_f16x8_haswell(a, n); + b_vec = _simsimd_partial_load_f16x8_haswell(b, n); n = 0; } else { a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a)); @@ -715,11 +715,11 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const* a, simsimd_f16 if (n) goto simsimd_l2sq_f16_haswell_cycle; - *result = _mm256_reduce_add_ps_dbl(d2_vec); + *result = _simsimd_reduce_f32x8_haswell(d2_vec); } -SIMSIMD_INTERNAL simsimd_distance_t simsimd_cos_normalize_f64_haswell(simsimd_f64_t ab, simsimd_f64_t a2, - simsimd_f64_t b2) { +SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_haswell(simsimd_f64_t ab, simsimd_f64_t a2, + simsimd_f64_t b2) { // If both vectors have magnitude 0, the distance is 0. if (a2 == 0 && b2 == 0) @@ -732,7 +732,7 @@ SIMSIMD_INTERNAL simsimd_distance_t simsimd_cos_normalize_f64_haswell(simsimd_f6 // https://web.archive.org/web/20210208132927/http://assemblyrequired.crashworks.org/timing-square-root/ // The latency of the native instruction is 4 cycles and it's broadly supported. // For single-precision floats it has a maximum relative error of 1.5*2^-12. - // Higher precision isn't implemented on older CPUs. See `simsimd_cos_normalize_f64_skylake` for that. + // Higher precision isn't implemented on older CPUs. See `_simsimd_cos_normalize_f64_skylake` for that. 
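For context on the `_simsimd_cos_normalize_*` helpers renamed throughout this file: they all perform the same final step, turning the three reduced sums (the dot product and the two squared norms) into a cosine distance, with the reciprocal square roots approximated as described in the comment above. A plain scalar sketch of that step (illustrative only; the real helpers use `_mm_rsqrt_ps` or `_mm_maskz_rsqrt14_pd` instead of `sqrt`, and the exact zero-norm conventions are the ones visible in the surrounding diff):

```c
#include <math.h>

// Cosine distance from the reduced sums:
//   ab = sum(a[i] * b[i]), a2 = sum(a[i]^2), b2 = sum(b[i]^2).
static double cos_normalize(double ab, double a2, double b2) {
    if (a2 == 0 && b2 == 0) return 0; // both vectors are zero
    if (ab == 0) return 1;            // orthogonal, or one vector is zero
    // distance = 1 - cos(angle) = 1 - ab / (|a| * |b|); the SIMD versions
    // replace the two square roots with fast reciprocal-square-root estimates.
    return 1 - ab / (sqrt(a2) * sqrt(b2));
}
```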
__m128d rsqrts = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(_mm_set_pd(a2, b2)))); simsimd_f64_t a2_reciprocal = _mm_cvtsd_f64(_mm_unpackhi_pd(rsqrts, rsqrts)); simsimd_f64_t b2_reciprocal = _mm_cvtsd_f64(rsqrts); @@ -748,8 +748,8 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const* a, simsimd_f16_ simsimd_cos_f16_haswell_cycle: if (n < 8) { - a_vec = simsimd_partial_load_f16x8_haswell(a, n); - b_vec = simsimd_partial_load_f16x8_haswell(b, n); + a_vec = _simsimd_partial_load_f16x8_haswell(a, n); + b_vec = _simsimd_partial_load_f16x8_haswell(b, n); n = 0; } else { a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a)); @@ -762,10 +762,10 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const* a, simsimd_f16_ if (n) goto simsimd_cos_f16_haswell_cycle; - simsimd_f64_t ab = _mm256_reduce_add_ps_dbl(ab_vec); - simsimd_f64_t a2 = _mm256_reduce_add_ps_dbl(a2_vec); - simsimd_f64_t b2 = _mm256_reduce_add_ps_dbl(b2_vec); - *result = simsimd_cos_normalize_f64_haswell(ab, a2, b2); + simsimd_f64_t ab = _simsimd_reduce_f32x8_haswell(ab_vec); + simsimd_f64_t a2 = _simsimd_reduce_f32x8_haswell(a2_vec); + simsimd_f64_t b2 = _simsimd_reduce_f32x8_haswell(b2_vec); + *result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2); } SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, @@ -775,12 +775,12 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_b simsimd_l2sq_bf16_haswell_cycle: if (n < 8) { - a_vec = simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(a, n)); - b_vec = simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(b, n)); + a_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a, n)); + b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n)); n = 0; } else { - a_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a)); - b_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b)); + a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a)); + b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b)); n -= 8, a += 8, b += 8; } __m256 d_vec = _mm256_sub_ps(a_vec, b_vec); @@ -788,7 +788,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_b if (n) goto simsimd_l2sq_bf16_haswell_cycle; - *result = _mm256_reduce_add_ps_dbl(d2_vec); + *result = _simsimd_reduce_f32x8_haswell(d2_vec); } SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, @@ -798,12 +798,12 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf simsimd_cos_bf16_haswell_cycle: if (n < 8) { - a_vec = simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(a, n)); - b_vec = simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(b, n)); + a_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a, n)); + b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n)); n = 0; } else { - a_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a)); - b_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b)); + a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a)); + b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b)); n -= 8, a += 8, b += 8; } ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec); @@ -812,10 +812,10 @@ 
@@ -775,12 +775,12 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_b
 simsimd_l2sq_bf16_haswell_cycle:
     if (n < 8) {
-        a_vec = simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(a, n));
-        b_vec = simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(b, n));
+        a_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a, n));
+        b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
         n = 0;
     } else {
-        a_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a));
-        b_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b));
+        a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a));
+        b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b));
         n -= 8, a += 8, b += 8;
     }
     __m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
@@ -788,7 +788,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_b
     if (n)
         goto simsimd_l2sq_bf16_haswell_cycle;
 
-    *result = _mm256_reduce_add_ps_dbl(d2_vec);
+    *result = _simsimd_reduce_f32x8_haswell(d2_vec);
 }
 
 SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n,
@@ -798,12 +798,12 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf
 simsimd_cos_bf16_haswell_cycle:
     if (n < 8) {
-        a_vec = simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(a, n));
-        b_vec = simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(b, n));
+        a_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a, n));
+        b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
         n = 0;
     } else {
-        a_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a));
-        b_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b));
+        a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a));
+        b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b));
         n -= 8, a += 8, b += 8;
     }
     ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
@@ -812,10 +812,10 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf
     if (n)
         goto simsimd_cos_bf16_haswell_cycle;
 
-    simsimd_f64_t ab = _mm256_reduce_add_ps_dbl(ab_vec);
-    simsimd_f64_t a2 = _mm256_reduce_add_ps_dbl(a2_vec);
-    simsimd_f64_t b2 = _mm256_reduce_add_ps_dbl(b2_vec);
-    *result = simsimd_cos_normalize_f64_haswell(ab, a2, b2);
+    simsimd_f64_t ab = _simsimd_reduce_f32x8_haswell(ab_vec);
+    simsimd_f64_t a2 = _simsimd_reduce_f32x8_haswell(a2_vec);
+    simsimd_f64_t b2 = _simsimd_reduce_f32x8_haswell(b2_vec);
+    *result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2);
 }
 
 SIMSIMD_PUBLIC void simsimd_l2sq_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n,
@@ -919,7 +919,7 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t c
         ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
     }
 
-    *result = simsimd_cos_normalize_f64_haswell(ab, a2, b2);
+    *result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2);
 }
 
 SIMSIMD_PUBLIC void simsimd_l2sq_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n,
@@ -934,7 +934,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f32_haswell(simsimd_f32
         d2_vec = _mm256_fmadd_ps(d_vec, d_vec, d2_vec);
     }
 
-    simsimd_f64_t d2 = _mm256_reduce_add_ps_dbl(d2_vec);
+    simsimd_f64_t d2 = _simsimd_reduce_f32x8_haswell(d2_vec);
     for (; i < n; ++i) {
         float d = a[i] - b[i];
         d2 += d * d;
@@ -958,14 +958,14 @@ SIMSIMD_PUBLIC void simsimd_cos_f32_haswell(simsimd_f32_t const* a, simsimd_f32_
         b2_vec = _mm256_fmadd_ps(b_vec, b_vec, b2_vec);
     }
 
-    simsimd_f64_t ab = _mm256_reduce_add_ps_dbl(ab_vec);
-    simsimd_f64_t a2 = _mm256_reduce_add_ps_dbl(a2_vec);
-    simsimd_f64_t b2 = _mm256_reduce_add_ps_dbl(b2_vec);
+    simsimd_f64_t ab = _simsimd_reduce_f32x8_haswell(ab_vec);
+    simsimd_f64_t a2 = _simsimd_reduce_f32x8_haswell(a2_vec);
+    simsimd_f64_t b2 = _simsimd_reduce_f32x8_haswell(b2_vec);
     for (; i < n; ++i) {
         float ai = a[i], bi = b[i];
         ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
     }
-    *result = simsimd_cos_normalize_f64_haswell(ab, a2, b2);
+    *result = _simsimd_cos_normalize_f64_haswell(ab, a2, b2);
 }
 
 #pragma clang attribute pop
@@ -974,8 +974,8 @@ SIMSIMD_PUBLIC void simsimd_cos_f32_haswell(simsimd_f32_t const* a, simsimd_f32_
 #if SIMSIMD_TARGET_SKYLAKE
 #pragma GCC push_options
-#pragma GCC target("avx512f", "avx512vl", "bmi2")
-#pragma clang attribute push(__attribute__((target("avx512f,avx512vl,bmi2"))), apply_to = function)
+#pragma GCC target("avx512f", "avx512bw", "avx512vl", "bmi2")
+#pragma clang attribute push(__attribute__((target("avx512f,avx512bw,avx512vl,bmi2"))), apply_to = function)
 
 SIMSIMD_PUBLIC void simsimd_l2sq_f32_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n,
                                              simsimd_distance_t* result) {
@@ -998,11 +998,11 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f32_skylake(simsimd_f32
     if (n)
         goto simsimd_l2sq_f32_skylake_cycle;
 
-    *result = _mm512_reduce_add_ps(d2_vec);
+    *result = _simsimd_reduce_f32x16_skylake(d2_vec);
 }
 
-SIMSIMD_INTERNAL simsimd_distance_t simsimd_cos_normalize_f64_skylake(simsimd_f64_t ab, simsimd_f64_t a2,
-                                                                      simsimd_f64_t b2) {
+SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f64_skylake(simsimd_f64_t ab, simsimd_f64_t a2,
+                                                                       simsimd_f64_t b2) {
     // If both vectors have magnitude 0, the distance is 0.
     if (a2 == 0 && b2 == 0)
@@ -1014,7 +1014,7 @@ SIMSIMD_INTERNAL simsimd_distance_t simsimd_cos_normalize_f64_skylake(simsimd_f6
     // We want to avoid the `simsimd_approximate_inverse_square_root` due to high latency:
     // https://web.archive.org/web/20210208132927/http://assemblyrequired.crashworks.org/timing-square-root/
     // The maximum relative error for this approximation is less than 2^-14, which is 6x lower than
-    // for single-precision floats in the `simsimd_cos_normalize_f64_haswell` implementation.
+    // for single-precision floats in the `_simsimd_cos_normalize_f64_haswell` implementation.
     // Mysteriously, MSVC has no `_mm_rsqrt14_pd` intrinsic, but has it's masked variants,
     // so let's use `_mm_maskz_rsqrt14_pd(0xFF, ...)` instead.
     __m128d rsqrts = _mm_maskz_rsqrt14_pd(0xFF, _mm_set_pd(a2, b2));
@@ -1047,10 +1047,10 @@ SIMSIMD_PUBLIC void simsimd_cos_f32_skylake(simsimd_f32_t const* a, simsimd_f32_
     if (n)
         goto simsimd_cos_f32_skylake_cycle;
 
-    simsimd_f64_t ab = _mm512_reduce_add_ps(ab_vec);
-    simsimd_f64_t a2 = _mm512_reduce_add_ps(a2_vec);
-    simsimd_f64_t b2 = _mm512_reduce_add_ps(b2_vec);
-    *result = simsimd_cos_normalize_f64_skylake(ab, a2, b2);
+    simsimd_f64_t ab = _simsimd_reduce_f32x16_skylake(ab_vec);
+    simsimd_f64_t a2 = _simsimd_reduce_f32x16_skylake(a2_vec);
+    simsimd_f64_t b2 = _simsimd_reduce_f32x16_skylake(b2_vec);
+    *result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
 }
 
 SIMSIMD_PUBLIC void simsimd_l2sq_f64_skylake(simsimd_f64_t const* a, simsimd_f64_t const* b, simsimd_size_t n,
@@ -1104,7 +1104,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_skylake(simsimd_f64_
     simsimd_f64_t ab = _mm512_reduce_add_pd(ab_vec);
     simsimd_f64_t a2 = _mm512_reduce_add_pd(a2_vec);
     simsimd_f64_t b2 = _mm512_reduce_add_pd(b2_vec);
-    *result = simsimd_cos_normalize_f64_skylake(ab, a2, b2);
+    *result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
 }
 
 #pragma clang attribute pop
@@ -1116,7 +1116,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_skylake(simsimd_f64_
 #pragma GCC target("avx512f", "avx512vl", "bmi2", "avx512bw", "avx512bf16")
 #pragma clang attribute push(__attribute__((target("avx512f,avx512vl,bmi2,avx512bw,avx512bf16"))), apply_to = function)
 
-SIMSIMD_INTERNAL __m512i simsimd_substract_bf16x32_genoa(__m512i a_i16, __m512i b_i16) {
+SIMSIMD_INTERNAL __m512i _simsimd_substract_bf16x32_genoa(__m512i a_i16, __m512i b_i16) {
     union {
         __m512 fvec;
@@ -1184,12 +1184,12 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_genoa(simsimd_bf1
         b_i16_vec = _mm512_loadu_epi16(b);
         a += 32, b += 32, n -= 32;
     }
-    d_i16_vec = simsimd_substract_bf16x32_genoa(a_i16_vec, b_i16_vec);
+    d_i16_vec = _simsimd_substract_bf16x32_genoa(a_i16_vec, b_i16_vec);
     d2_vec = _mm512_dpbf16_ps(d2_vec, (__m512bh)(d_i16_vec), (__m512bh)(d_i16_vec));
     if (n)
         goto simsimd_l2sq_bf16_genoa_cycle;
 
-    *result = _mm512_reduce_add_ps(d2_vec);
+    *result = _simsimd_reduce_f32x16_skylake(d2_vec);
 }
 
 SIMSIMD_PUBLIC void simsimd_cos_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n,
@@ -1216,10 +1216,10 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_genoa(simsimd_bf16
     if (n)
         goto simsimd_cos_bf16_genoa_cycle;
 
-    simsimd_f64_t ab = _mm512_reduce_add_ps(ab_vec);
-    simsimd_f64_t a2 = _mm512_reduce_add_ps(a2_vec);
-    simsimd_f64_t b2 = _mm512_reduce_add_ps(b2_vec);
-    *result = simsimd_cos_normalize_f64_skylake(ab, a2, b2);
+    simsimd_f64_t ab = _simsimd_reduce_f32x16_skylake(ab_vec);
+    simsimd_f64_t a2 = _simsimd_reduce_f32x16_skylake(a2_vec);
+    simsimd_f64_t b2 = _simsimd_reduce_f32x16_skylake(b2_vec);
+    *result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
 }
 
 #pragma clang attribute pop
@@ -1282,7 +1282,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_sapphire(simsimd_f16_t const* a, simsimd_f16
     simsimd_f64_t ab = _mm512_reduce_add_ph(ab_vec);
     simsimd_f64_t a2 = _mm512_reduce_add_ph(a2_vec);
     simsimd_f64_t b2 = _mm512_reduce_add_ph(b2_vec);
-    *result = simsimd_cos_normalize_f64_skylake(ab, a2, b2);
+    *result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
 }
 
 #pragma clang attribute pop
@@ -1365,7 +1365,7 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const
     int ab = _mm512_reduce_add_epi32(_mm512_add_epi32(ab_low_i32s_vec, ab_high_i32s_vec));
     int a2 = _mm512_reduce_add_epi32(a2_i32s_vec);
     int b2 = _mm512_reduce_add_epi32(b2_i32s_vec);
-    *result = simsimd_cos_normalize_f64_skylake(ab, a2, b2);
+    *result = _simsimd_cos_normalize_f64_skylake(ab, a2, b2);
 }
 
 #pragma clang attribute pop
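For readers unfamiliar with the `bf16` layout the Genoa kernels above manipulate: a bfloat16 value is just the upper 16 bits of an IEEE float32, so upcasting is a 16-bit shift and a crude downcast is truncation. The renamed `_simsimd_substract_bf16x32_genoa` helper appears to rely on that property; the sketch below is a hedged NumPy illustration of the idea (truncation instead of proper rounding), and the same trick underlies the test suite's `f32_round_and_downcast_to_bf16` helper further down.

```py
import numpy as np

def bf16_to_f32(bf16_bits: np.ndarray) -> np.ndarray:
    """Upcast uint16-encoded bfloat16 values by shifting them into the float32 high half."""
    return (bf16_bits.astype(np.uint32) << 16).view(np.float32)

def f32_to_bf16_truncated(f32: np.ndarray) -> np.ndarray:
    """Downcast float32 to uint16-encoded bfloat16 by dropping the low 16 mantissa bits."""
    return (f32.view(np.uint32) >> 16).astype(np.uint16)

a = np.random.randn(32).astype(np.float32)
b = np.random.randn(32).astype(np.float32)
# Emulating a "subtract bf16 vectors" kernel: upcast both sides, subtract in float32.
d = bf16_to_f32(f32_to_bf16_truncated(a)) - bf16_to_f32(f32_to_bf16_truncated(b))
print(d[:4])
```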
diff --git a/pyproject.toml b/pyproject.toml
index 3781f82a..f6aed989 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ filterwarnings = ["error"]
 
 [tool.cibuildwheel]
 build-verbosity = 0
-test-requires = ["pytest", "pytest-repeat"]
+test-requires = ["pytest", "pytest-repeat", "tabulate"]
 
 # Assuming NumPy and SciPy aren't available precompiled for some platforms,
 # we will end up compiling them from source on emulated hardware...
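The only reason `tabulate` joins the `cibuildwheel` test requirements is the end-of-session accuracy report produced by the new `stats_fixture` in `python/test.py` below. A small sketch of the call it boils down to; the numbers here are placeholders purely to illustrate the layout, not measurements.

```py
import tabulate

headers = ["Metric", "NDim", "DType", "Baseline Error", "SimSIMD Error", "SimSIMD Speedup"]
rows = [
    # Placeholder values, only showing the shape of the report.
    ("cosine", 1536, "float32", "1.1e-07 ± 2.0e-08", "3.4e-08 ± 1.2e-08", "12.50x ± 3.10x"),
    ("inner", 1536, "bfloat16", "9.8e-04 ± 1.1e-04", "7.2e-04 ± 9.0e-05", "18.20x ± 4.40x"),
]
print(tabulate.tabulate(rows, headers=headers, tablefmt="pretty", showindex=True))
```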
diff --git a/python/test.py b/python/test.py
index f9aa94dd..17b7315c 100644
--- a/python/test.py
+++ b/python/test.py
@@ -1,6 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Module: test.py
+
+This module contains a suite of tests for the `simsimd` package.
+It compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances)
+with their NumPy or baseline counterparts, testing accuracy for different data types including
+floating-point, integer, and complex numbers.
+
+The tests cover:
+
+- **Dense Vector Operations**: Tests for `float64`, `float32`, `float16` data types using metrics like `inner`, `sqeuclidean`, and `cosine`.
+- **Brain Floating-Point Format (bfloat16)**: Tests for operations with the brain floating-point format not natively supported by NumPy.
+- **Integer Operations**: Tests for `int8` data type, ensuring accuracy without overflow.
+- **Bitwise Operations**: Tests for Hamming and Jaccard distances using bit arrays.
+- **Complex Numbers**: Tests for complex dot products and vector dot products.
+- **Batch Operations and Cross-Distance Computations**: Tests for batch processing and cross-distance computations using `cdist`.
+- **Hardware Capabilities Verification**: Checks the availability of hardware capabilities and function pointers.
+
+**Dependencies**:
+
+- Python 3.x
+- `numpy`
+- `scipy`
+- `pytest`
+- `tabulate`
+- `simsimd` package
+
+**Usage**:
+
+Run the tests using pytest:
+
+    pytest test.py
+
+Or run the script directly:
+
+    python test.py
+
+"""
+
 import os
+import math
+import time
 import platform
+import collections
+import tabulate
 
 import pytest
 
 import simsimd as simd
@@ -27,7 +72,7 @@
     baseline_inner = np.inner
     baseline_sqeuclidean = spd.sqeuclidean
     baseline_cosine = spd.cosine
-    baseline_jensenshannon = spd.jensenshannon
+    baseline_jensenshannon = lambda x, y: spd.jensenshannon(x, y) ** 2
     baseline_hamming = lambda x, y: spd.hamming(x, y) * len(x)
     baseline_jaccard = spd.jaccard
     baseline_intersect = lambda x, y: len(np.intersect1d(x, y))
@@ -48,7 +93,7 @@ def baseline_mahalanobis(x, y, z):
     baseline_inner = lambda x, y: np.inner(x, y)
     baseline_cosine = lambda x, y: 1.0 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
     baseline_sqeuclidean = lambda x, y: np.sum((x - y) ** 2)
-    baseline_jensenshannon = lambda p, q: np.sqrt((np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / 2)
+    baseline_jensenshannon = lambda p, q: (np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / 2
     baseline_hamming = lambda x, y: np.logical_xor(x, y).sum()
 
     baseline_bilinear = lambda x, y, z: x @ z @ y
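A note on the reworked Jensen-Shannon baselines above: `scipy.spatial.distance.jensenshannon` returns the JS distance, i.e. the square root of the divergence, while the kernel under test reports the divergence itself, hence the `** 2` wrapper; the NumPy-only fallback drops its outer `sqrt` for the same reason. A quick sanity check of that relationship:

```py
import numpy as np
import scipy.spatial.distance as spd

p = np.random.rand(97)
p /= p.sum()
q = np.random.rand(97)
q /= q.sum()

js_distance = spd.jensenshannon(p, q)  # the distance, square root of the divergence
js_divergence = js_distance**2         # what the wrapped baseline now returns
assert 0.0 <= js_divergence <= js_distance <= 1.0
```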
@@ -79,12 +124,222 @@ def is_running_under_qemu():
     return "SIMSIMD_IN_QEMU" in os.environ
 
 
+def profile(callable, *args, **kwargs) -> tuple:
+    before = time.perf_counter_ns()
+    result = callable(*args, **kwargs)
+    after = time.perf_counter_ns()
+    return after - before, result
+
+
+@pytest.fixture(scope="session")
+def stats_fixture():
+    """Session-scoped fixture that collects errors during tests."""
+    results = dict()
+    results["metric"] = []
+    results["ndim"] = []
+    results["dtype"] = []
+    results["absolute_baseline_error"] = []
+    results["relative_baseline_error"] = []
+    results["absolute_simsimd_error"] = []
+    results["relative_simsimd_error"] = []
+    results["accurate_duration"] = []
+    results["baseline_duration"] = []
+    results["simsimd_duration"] = []
+    yield results
+
+    # Group the errors by (metric, ndim, dtype) to compute the mean and standard deviation of each group.
+    grouped_errors = collections.defaultdict(
+        lambda: {
+            "absolute_baseline_error": [],
+            "relative_baseline_error": [],
+            "absolute_simsimd_error": [],
+            "relative_simsimd_error": [],
+            "accurate_duration": [],
+            "baseline_duration": [],
+            "simsimd_duration": [],
+        }
+    )
+    for (
+        metric,
+        ndim,
+        dtype,
+        absolute_baseline_error,
+        relative_baseline_error,
+        absolute_simsimd_error,
+        relative_simsimd_error,
+        accurate_duration,
+        baseline_duration,
+        simsimd_duration,
+    ) in zip(
+        results["metric"],
+        results["ndim"],
+        results["dtype"],
+        results["absolute_baseline_error"],
+        results["relative_baseline_error"],
+        results["absolute_simsimd_error"],
+        results["relative_simsimd_error"],
+        results["accurate_duration"],
+        results["baseline_duration"],
+        results["simsimd_duration"],
+    ):
+        key = (metric, ndim, dtype)
+        grouped_errors[key]["absolute_baseline_error"].append(absolute_baseline_error)
+        grouped_errors[key]["relative_baseline_error"].append(relative_baseline_error)
+        grouped_errors[key]["absolute_simsimd_error"].append(absolute_simsimd_error)
+        grouped_errors[key]["relative_simsimd_error"].append(relative_simsimd_error)
+        grouped_errors[key]["accurate_duration"].append(accurate_duration)
+        grouped_errors[key]["baseline_duration"].append(baseline_duration)
+        grouped_errors[key]["simsimd_duration"].append(simsimd_duration)
+
+    # Compute the mean and standard deviation of the errors and durations for each group
+    final_results = []
+    for key, errors in grouped_errors.items():
+        n = len(errors["simsimd_duration"])
+
+        # Mean and standard deviation of the relative errors
+        baseline_errors = errors["relative_baseline_error"]
+        simsimd_errors = errors["relative_simsimd_error"]
+        baseline_mean = sum(baseline_errors) / n
+        simsimd_mean = sum(simsimd_errors) / n
+        baseline_std = math.sqrt(sum((x - baseline_mean) ** 2 for x in baseline_errors) / n)
+        simsimd_std = math.sqrt(sum((x - simsimd_mean) ** 2 for x in simsimd_errors) / n)
+        baseline_error_formatted = f"{baseline_mean:.2e} ± {baseline_std:.2e}"
+        simsimd_error_formatted = f"{simsimd_mean:.2e} ± {simsimd_std:.2e}"
+
+        # Log durations
+        accurate_durations = errors["accurate_duration"]
+        baseline_durations = errors["baseline_duration"]
+        simsimd_durations = errors["simsimd_duration"]
+        accurate_mean_duration = sum(accurate_durations) / n
+        baseline_mean_duration = sum(baseline_durations) / n
+        simsimd_mean_duration = sum(simsimd_durations) / n
+        accurate_std_duration = math.sqrt(sum((x - accurate_mean_duration) ** 2 for x in accurate_durations) / n)
+        baseline_std_duration = math.sqrt(sum((x - baseline_mean_duration) ** 2 for x in baseline_durations) / n)
+        simsimd_std_duration = math.sqrt(sum((x - simsimd_mean_duration) ** 2 for x in simsimd_durations) / n)
+        accurate_duration = f"{accurate_mean_duration:.2e} ± {accurate_std_duration:.2e}"
+        baseline_duration = f"{baseline_mean_duration:.2e} ± {baseline_std_duration:.2e}"
+        simsimd_duration = f"{simsimd_mean_duration:.2e} ± {simsimd_std_duration:.2e}"
+
+        # Measure time improvement
+        improvements = [baseline / simsimd for baseline, simsimd in zip(baseline_durations, simsimd_durations)]
+        improvements_mean = sum(improvements) / n
+        improvements_std = math.sqrt(sum((x - improvements_mean) ** 2 for x in improvements) / n)
+        simsimd_speedup = f"{improvements_mean:.2f}x ± {improvements_std:.2f}x"
+
+        # Calculate Improvement
+        # improvement = abs(baseline_mean - simsimd_mean) / min(simsimd_mean, baseline_mean)
+        # if baseline_mean < simsimd_mean:
+        #     improvement *= -1
+        # improvement_formatted = f"{improvement:+.2}x" if improvement != float("inf") else "N/A"
+
+        final_results.append(
+            (
+                *key,
+                baseline_error_formatted,
+                simsimd_error_formatted,
+                accurate_duration,
+                baseline_duration,
+                simsimd_duration,
+                simsimd_speedup,
+            )
+        )
+
+    # Sort results for consistent presentation
+    final_results.sort(key=lambda x: (x[0], x[1], x[2]))
+
+    # Output the final table after all tests are completed
+    print("\n")
+    print("Numerical Error Aggregation Report:")
+    headers = [
+        "Metric",
+        "NDim",
+        "DType",
+        "Baseline Error",  # Printed as mean ± std deviation
+        "SimSIMD Error",  # Printed as mean ± std deviation
+        "Accurate Duration",  # Printed as mean ± std deviation
+        "Baseline Duration",  # Printed as mean ± std deviation
+        "SimSIMD Duration",  # Printed as mean ± std deviation
+        "SimSIMD Speedup",
+    ]
+    print(tabulate.tabulate(final_results, headers=headers, tablefmt="pretty", showindex=True))
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_runtest_makereport(item, call):
+    """Custom hook to ensure that the error aggregator runs even for failed tests."""
+    if call.when == "call":
+        item.test_result = call.excinfo is None
+
+
+def collect_errors(
+    metric: str,
+    ndim: int,
+    dtype: str,
+    accurate_result: float,
+    accurate_duration: float,
+    baseline_result: float,
+    baseline_duration: float,
+    simsimd_result: float,
+    simsimd_duration: float,
+    stats,
+):
+    """Calculates and aggregates errors for a given test.
+
+    What we want to know at the end of the day is:
+
+    - How much more or less accurate the SimSIMD implementation is than the baseline,
+      when compared against the accurate result?
+    - TODO: How much faster is SimSIMD than the baseline kernel?
+    - TODO: How much faster is SimSIMD than the accurate kernel?
+    """
+    absolute_baseline_error = np.abs(baseline_result - accurate_result)
+    relative_baseline_error = absolute_baseline_error / np.abs(accurate_result)
+    absolute_simsimd_error = np.abs(simsimd_result - accurate_result)
+    relative_simsimd_error = absolute_simsimd_error / np.abs(accurate_result)
+
+    stats["metric"].append(metric)
+    stats["ndim"].append(ndim)
+    stats["dtype"].append(dtype)
+    stats["absolute_baseline_error"].append(absolute_baseline_error)
+    stats["relative_baseline_error"].append(relative_baseline_error)
+    stats["absolute_simsimd_error"].append(absolute_simsimd_error)
+    stats["relative_simsimd_error"].append(relative_simsimd_error)
+    stats["accurate_duration"].append(accurate_duration)
+    stats["baseline_duration"].append(baseline_duration)
+    stats["simsimd_duration"].append(simsimd_duration)
+
+
 # For normalized distances we use the absolute tolerance, because the result is close to zero.
 # For unnormalized ones (like squared Euclidean or Jaccard), we use the relative.
 SIMSIMD_RTOL = 0.1
 SIMSIMD_ATOL = 0.1
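The tolerance constants above encode the rule the comments state: normalized distances (cosine, Jensen-Shannon) sit close to zero, so a relative bound is nearly meaningless and the absolute one does the work, while unnormalized values (squared Euclidean, Jaccard) lean on the relative bound. `assert_allclose` combines both as |actual - desired| <= atol + rtol * |desired|, so either term can absorb the error:

```py
import numpy as np

# Near-zero cosine distance: the relative term is tiny, the absolute tolerance does the work.
np.testing.assert_allclose(0.003, 0.0, atol=0.1, rtol=0.1)
# Large squared Euclidean distance: the relative term dominates and absorbs a 5% deviation.
np.testing.assert_allclose(105.0, 100.0, atol=0.1, rtol=0.1)
```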
+ """ + if name == "inner": + return baseline_inner, simd.inner + elif name == "sqeuclidean": + return baseline_sqeuclidean, simd.sqeuclidean + elif name == "cosine": + return baseline_cosine, simd.cosine + elif name == "bilinear": + return baseline_bilinear, simd.bilinear + elif name == "mahalanobis": + return baseline_mahalanobis, simd.mahalanobis + elif name == "jaccard": + return baseline_jaccard, simd.jaccard + elif name == "hamming": + return baseline_hamming, simd.hamming + elif name == "intersect": + return baseline_intersect, simd.intersect + else: + raise ValueError(f"Unknown kernel name: {name}") + + def f32_round_and_downcast_to_bf16(array): """Converts an array of 32-bit floats into 16-bit brain-floats.""" array = np.asarray(array, dtype=np.float32) @@ -160,15 +415,8 @@ def test_capabilities_list(): @pytest.mark.repeat(50) @pytest.mark.parametrize("ndim", [11, 97, 1536]) @pytest.mark.parametrize("dtype", ["float64", "float32", "float16"]) -@pytest.mark.parametrize( - "kernels", - [ - (baseline_inner, simd.inner), - (baseline_sqeuclidean, simd.sqeuclidean), - (baseline_cosine, simd.cosine), - ], -) -def test_dense(ndim, dtype, kernels): +@pytest.mark.parametrize("metric", ["inner", "sqeuclidean", "cosine"]) +def test_dense(ndim, dtype, metric, stats_fixture): """Compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances) with their NumPy or baseline counterparts, testing accuracy for IEEE standard floating-point types.""" @@ -179,16 +427,19 @@ def test_dense(ndim, dtype, kernels): a = np.random.randn(ndim).astype(dtype) b = np.random.randn(ndim).astype(dtype) - baseline_kernel, simd_kernel = kernels - expected = baseline_kernel(a, b).astype(np.float64) - result = simd_kernel(a, b) + baseline_kernel, simd_kernel = name_to_kernels(metric) - np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL) + accurate_dt, accurate = profile(baseline_kernel, a.astype(np.float64), b.astype(np.float64)) + expected_dt, expected = profile(baseline_kernel, a, b) + result_dt, result = profile(simd_kernel, a, b) + + np.testing.assert_allclose(result, expected.astype(np.float64), atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL) + collect_errors(metric, ndim, dtype, accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture) @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed") @pytest.mark.repeat(50) -@pytest.mark.parametrize("ndim", [11, 16, 33]) +@pytest.mark.parametrize("ndim", [11, 97]) @pytest.mark.parametrize( "dtypes", # representation datatype and compute precision [ @@ -197,14 +448,8 @@ def test_dense(ndim, dtype, kernels): ("float16", "float32"), # otherwise NumPy keeps aggregating too much error ], ) -@pytest.mark.parametrize( - "kernels", # baseline kernel and the contender from SimSIMD - [ - (baseline_bilinear, simd.bilinear), - (baseline_mahalanobis, simd.mahalanobis), - ], -) -def test_curved(ndim, dtypes, kernels): +@pytest.mark.parametrize("metric", ["bilinear", "mahalanobis"]) +def test_curved(ndim, dtypes, metric, stats_fixture): """Compares various SIMD kernels (like Bilinear Forms and Mahalanobis distances) for curved spaces with their NumPy or baseline counterparts, testing accuracy for IEEE standard floating-point types.""" @@ -226,29 +471,30 @@ def test_curved(ndim, dtypes, kernels): c = np.abs(np.random.randn(ndim, ndim).astype(dtype)) c = np.dot(c, c.T) - baseline_kernel, simd_kernel = kernels - expected = baseline_kernel( + baseline_kernel, simd_kernel = name_to_kernels(metric) + 
 
 
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
 @pytest.mark.repeat(50)
-@pytest.mark.parametrize("ndim", [11, 16, 33])
+@pytest.mark.parametrize("ndim", [11, 97])
 @pytest.mark.parametrize(
     "dtypes",  # representation datatype and compute precision
     [
@@ -197,14 +448,8 @@ def test_dense(ndim, dtype, kernels):
         ("float16", "float32"),  # otherwise NumPy keeps aggregating too much error
     ],
 )
-@pytest.mark.parametrize(
-    "kernels",  # baseline kernel and the contender from SimSIMD
-    [
-        (baseline_bilinear, simd.bilinear),
-        (baseline_mahalanobis, simd.mahalanobis),
-    ],
-)
-def test_curved(ndim, dtypes, kernels):
+@pytest.mark.parametrize("metric", ["bilinear", "mahalanobis"])
+def test_curved(ndim, dtypes, metric, stats_fixture):
     """Compares various SIMD kernels (like Bilinear Forms and Mahalanobis distances) for curved spaces
     with their NumPy or baseline counterparts, testing accuracy for IEEE standard floating-point types."""
@@ -226,29 +471,30 @@ def test_curved(ndim, dtypes, kernels):
     c = np.abs(np.random.randn(ndim, ndim).astype(dtype))
     c = np.dot(c, c.T)
 
-    baseline_kernel, simd_kernel = kernels
-    expected = baseline_kernel(
+    baseline_kernel, simd_kernel = name_to_kernels(metric)
+    accurate_dt, accurate = profile(
+        baseline_kernel,
+        a.astype(np.float64),
+        b.astype(np.float64),
+        c.astype(np.float64),
+    )
+    expected_dt, expected = profile(
+        baseline_kernel,
         a.astype(compute_dtype),
         b.astype(compute_dtype),
         c.astype(compute_dtype),
-    ).astype(np.float64)
-    result = simd_kernel(a, b, c)
+    )
+    result_dt, result = profile(simd_kernel, a, b, c)
 
     np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
+    collect_errors(metric, ndim, dtype, accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
 
 
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
 @pytest.mark.repeat(50)
-@pytest.mark.parametrize("ndim", [4, 8, 12])
-@pytest.mark.parametrize(
-    "kernels",
-    [
-        (baseline_inner, simd.inner),
-        (baseline_sqeuclidean, simd.sqeuclidean),
-        (baseline_cosine, simd.cosine),
-    ],
-)
-def test_dense_bf16(ndim, kernels):
+@pytest.mark.parametrize("ndim", [11, 97, 1536])
+@pytest.mark.parametrize("metric", ["inner", "sqeuclidean", "cosine"])
+def test_dense_bf16(ndim, metric, stats_fixture):
     """Compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances)
     with their NumPy or baseline counterparts, testing accuracy for the Brain-float format not
     natively supported by NumPy."""
@@ -259,9 +505,10 @@ def test_dense_bf16(ndim, kernels):
     a_f32_rounded, a_bf16 = f32_round_and_downcast_to_bf16(a)
     b_f32_rounded, b_bf16 = f32_round_and_downcast_to_bf16(b)
 
-    baseline_kernel, simd_kernel = kernels
-    expected = baseline_kernel(a_f32_rounded, b_f32_rounded).astype(np.float64)
-    result = simd_kernel(a_bf16, b_bf16, "bf16")
+    baseline_kernel, simd_kernel = name_to_kernels(metric)
+    accurate_dt, accurate = profile(baseline_kernel, a_f32_rounded.astype(np.float64), b_f32_rounded.astype(np.float64))
+    expected_dt, expected = profile(baseline_kernel, a_f32_rounded, b_f32_rounded)
+    result_dt, result = profile(simd_kernel, a_bf16, b_bf16, "bf16")
 
     np.testing.assert_allclose(
         result,
@@ -275,19 +522,16 @@ def test_dense_bf16(ndim, kernels):
         Second `bf16` operand in hex: {hex_array(b_bf16)}
         """,
     )
+    collect_errors(
+        metric, ndim, "bfloat16", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture
+    )
 
 
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
 @pytest.mark.repeat(50)
 @pytest.mark.parametrize("ndim", [11, 16, 33])
-@pytest.mark.parametrize(
-    "kernels",  # baseline kernel and the contender from SimSIMD
-    [
-        (baseline_bilinear, simd.bilinear),
-        (baseline_mahalanobis, simd.mahalanobis),
-    ],
-)
-def test_curved_bf16(ndim, kernels):
+@pytest.mark.parametrize("metric", ["bilinear", "mahalanobis"])
+def test_curved_bf16(ndim, metric, stats_fixture):
     """Compares various SIMD kernels (like Bilinear Forms and Mahalanobis distances) for curved spaces
     with their NumPy or baseline counterparts, testing accuracy for the Brain-float format not
     natively supported by NumPy."""
@@ -310,9 +554,15 @@ def test_curved_bf16(ndim, kernels):
     b_f32_rounded, b_bf16 = f32_round_and_downcast_to_bf16(b)
     c_f32_rounded, c_bf16 = f32_round_and_downcast_to_bf16(c)
 
-    baseline_kernel, simd_kernel = kernels
-    expected = baseline_kernel(a_f32_rounded, b_f32_rounded, c_f32_rounded).astype(np.float64)
-    result = simd_kernel(a_bf16, b_bf16, c_bf16, "bf16")
+    baseline_kernel, simd_kernel = name_to_kernels(metric)
+    accurate_dt, accurate = profile(
+        baseline_kernel,
+        a_f32_rounded.astype(np.float64),
+        b_f32_rounded.astype(np.float64),
+        c_f32_rounded.astype(np.float64),
+    )
+    expected_dt, expected = profile(baseline_kernel, a_f32_rounded, b_f32_rounded, c_f32_rounded)
+    result_dt, result = profile(simd_kernel, a_bf16, b_bf16, c_bf16, "bf16")
 
     np.testing.assert_allclose(
         result,
@@ -328,20 +578,16 @@ def test_curved_bf16(ndim, kernels):
         Matrix `bf16` operand in hex: {hex_array(c_bf16)}
         """,
     )
+    collect_errors(
+        metric, ndim, "bfloat16", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture
+    )
 
 
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
 @pytest.mark.repeat(50)
 @pytest.mark.parametrize("ndim", [11, 97, 1536])
-@pytest.mark.parametrize(
-    "kernels",
-    [
-        (baseline_inner, simd.inner),
-        (baseline_sqeuclidean, simd.sqeuclidean),
-        (baseline_cosine, simd.cosine),
-    ],
-)
-def test_dense_i8(ndim, kernels):
+@pytest.mark.parametrize("metric", ["inner", "sqeuclidean", "cosine"])
+def test_dense_i8(ndim, metric, stats_fixture):
     """Compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances)
     with their NumPy or baseline counterparts, testing accuracy for small integer types, that can't
     be directly processed with other tools without overflowing."""
@@ -350,7 +596,7 @@ def test_dense_i8(ndim, kernels):
     a = np.random.randint(-128, 127, size=(ndim), dtype=np.int8)
     b = np.random.randint(-128, 127, size=(ndim), dtype=np.int8)
 
-    baseline_kernel, simd_kernel = kernels
+    baseline_kernel, simd_kernel = name_to_kernels(metric)
 
     # Fun fact: SciPy doesn't actually raise an `OverflowError` when overflow happens
    # here, instead it raises `ValueError: math domain error` during the `sqrt` operation.
@@ -360,29 +606,33 @@ def test_dense_i8(ndim, kernels):
         expected_overflow = OverflowError()
     except ValueError:
         expected_overflow = ValueError()
-    expected = baseline_kernel(a.astype(np.float64), b.astype(np.float64))
-    result = simd_kernel(a, b)
+    accurate_dt, accurate = profile(baseline_kernel, a.astype(np.float64), b.astype(np.float64))
+    expected_dt, expected = profile(baseline_kernel, a.astype(np.int64), b.astype(np.int64))
+    result_dt, result = profile(simd_kernel, a, b)
 
     assert int(result) == int(expected), f"Expected {expected}, but got {result} (overflow: {expected_overflow})"
+    collect_errors(metric, ndim, "int8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
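The `int8` branch above is the one place where the baseline is deliberately upcast to `int64` (and to `float64` for the accurate reference): NumPy keeps 8-bit integer arithmetic in 8 bits and silently wraps around, which is exactly the overflow the docstring warns about. A short demonstration:

```py
import numpy as np

a = np.full(16, 100, dtype=np.int8)
b = np.full(16, 100, dtype=np.int8)

print((a * b)[0])                                         # 16: 100 * 100 = 10_000 wraps around in int8
print((a.astype(np.int64) * b)[0])                        # 10_000: widening first gives the real product
print(np.inner(a.astype(np.int64), b.astype(np.int64)))   # 160_000: a safe dot product
```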
 
 
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
 @pytest.mark.skipif(not scipy_available, reason="SciPy is not installed")
 @pytest.mark.repeat(50)
 @pytest.mark.parametrize("ndim", [11, 97, 1536])
-@pytest.mark.parametrize("kernels", [(baseline_hamming, simd.hamming), (baseline_jaccard, simd.jaccard)])
-def test_dense_bits(ndim, kernels):
+@pytest.mark.parametrize("metric", ["jaccard", "hamming"])
+def test_dense_bits(ndim, metric, stats_fixture):
     """Compares various SIMD kernels (like Hamming and Jaccard/Tanimoto distances) for dense bit arrays
     with their NumPy or baseline counterparts, even though, they can't process sub-byte-sized scalars."""
     np.random.seed()
     a = np.random.randint(2, size=ndim).astype(np.uint8)
     b = np.random.randint(2, size=ndim).astype(np.uint8)
 
-    baseline_kernel, simd_kernel = kernels
-    expected = baseline_kernel(a, b)
-    result = simd_kernel(np.packbits(a), np.packbits(b), "b8")
+    baseline_kernel, simd_kernel = name_to_kernels(metric)
+    accurate_dt, accurate = profile(baseline_kernel, a.astype(np.uint64), b.astype(np.uint64))
+    expected_dt, expected = profile(baseline_kernel, a, b)
+    result_dt, result = profile(simd_kernel, np.packbits(a), np.packbits(b), "b8")
 
     np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
+    collect_errors(metric, ndim, "bits", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
 
 
 @pytest.mark.skip(reason="Problems inferring the tolerance bounds for numerical errors")
@@ -397,10 +647,15 @@ def test_jensen_shannon(ndim, dtype):
     a /= np.sum(a)
     b /= np.sum(b)
 
-    expected = baseline_jensenshannon(a, b) ** 2
-    result = simd.jensenshannon(a, b)
+    baseline_kernel, simd_kernel = name_to_kernels("jensenshannon")
+    accurate_dt, accurate = profile(baseline_kernel, a.astype(np.float64), b.astype(np.float64))
+    expected_dt, expected = profile(baseline_kernel, a, b)
+    result_dt, result = profile(simd_kernel, a, b)
 
     np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
+    collect_errors(
+        "jensenshannon", ndim, dtype, accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture
+    )
 
 
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
@@ -421,42 +676,35 @@ def test_cosine_zero_vector(ndim, dtype):
     assert abs(result) < SIMSIMD_ATOL, f"Expected 0 distance from itself, but got {result}"
 
 
-@pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
-@pytest.mark.parametrize("ndim", [11, 97, 1536])
-@pytest.mark.parametrize("dtype", ["float64", "float32", "float16"])
-def test_cosine_tolerance(ndim, dtype):
-    """Tests the simd.cosine() function analyzing its `rsqrt` approximation error."""
-    a = np.random.randn(ndim).astype(dtype)
-    b = np.random.randn(ndim).astype(dtype)
-
-    expected_f64 = baseline_cosine(a.astype(np.float64), b.astype(np.float64))
-    result_f64 = simd.cosine(a, b)
-    expected = np.array(expected_f64, dtype=dtype)
-    result = np.array(result_f64, dtype=dtype)
-    assert np.allclose(expected, result, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
-
-
 @pytest.mark.skipif(is_running_under_qemu(), reason="Complex math in QEMU fails")
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
 @pytest.mark.repeat(50)
 @pytest.mark.parametrize("ndim", [22, 66, 1536])
 @pytest.mark.parametrize("dtype", ["float32", "float64"])
-def test_dot_complex(ndim, dtype):
+def test_dot_complex(ndim, dtype, stats_fixture):
     """Compares the simd.dot() and simd.vdot() against NumPy for complex numbers."""
     np.random.seed()
     dtype_view = np.complex64 if dtype == "float32" else np.complex128
     a = np.random.randn(ndim).astype(dtype=dtype).view(dtype_view)
     b = np.random.randn(ndim).astype(dtype=dtype).view(dtype_view)
 
-    expected = np.dot(a, b)
-    result = simd.dot(a, b)
+    accurate_dt, accurate = profile(np.dot, a.astype(np.complex128), b.astype(np.complex128))
+    expected_dt, expected = profile(np.dot, a, b)
+    result_dt, result = profile(simd.dot, a, b)
 
     np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
+    collect_errors(
+        "dot", ndim, dtype + "c", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture
+    )
 
-    expected = np.vdot(a, b)
-    result = simd.vdot(a, b)
+    accurate_dt, accurate = profile(np.vdot, a.astype(np.complex128), b.astype(np.complex128))
+    expected_dt, expected = profile(np.vdot, a, b)
+    result_dt, result = profile(simd.vdot, a, b)
 
     np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
+    collect_errors(
+        "vdot", ndim, dtype + "c", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture
+    )
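`test_dot_complex` above checks two different products on purpose: `np.dot` multiplies the complex vectors as given, while `np.vdot` conjugates its first argument, and `simd.dot` / `simd.vdot` are expected to mirror that pair. A two-element illustration of the difference:

```py
import numpy as np

a = np.array([1 + 2j, 3 - 1j])
b = np.array([2 - 1j, 1 + 4j])

print(np.dot(a, b))   # sum(a[i] * b[i])        -> (11+14j)
print(np.vdot(a, b))  # sum(conj(a[i]) * b[i])  -> (-1+8j)
```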
 
 
 @pytest.mark.skipif(is_running_under_qemu(), reason="Complex math in QEMU fails")
@@ -613,4 +861,11 @@ def test_cdist_hamming(ndim, out_dtype):
 
 
 if __name__ == "__main__":
-    pytest.main()
+    pytest.main(
+        [
+            "-s",  # Print stdout
+            "-x",  # Stop on first failure
+            "-v",  # Verbose output
+            "--tb=short",  # Short traceback format
+        ]
+    )