Merge pull request #182 from ashvardanian/main-dev
Improved Accuracy & Testing Utility
ashvardanian committed Sep 13, 2024
2 parents 0159003 + 8075cb3 commit 20f4caa
Showing 9 changed files with 596 additions and 311 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/prerelease.yml
@@ -64,7 +64,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --no-cache-dir --upgrade pip
pip install --no-cache-dir pytest numpy scipy py-cpuinfo pytest-repeat
pip install --no-cache-dir py-cpuinfo pytest pytest-repeat numpy scipy tabulate
python -c "from cpuinfo import get_cpu_info; print(get_cpu_info())"
- name: Build locally on Ubuntu
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
@@ -40,9 +40,9 @@ cmake --build build_release --config Release
Testing:

```sh
pip install -e . # to install the package in editable mode
pip install pytest pytest-repeat # testing dependencies
pytest python/test.py -s -x -Wd # to run tests
pip install -e . # to install the package in editable mode
pip install pytest pytest-repeat tabulate # testing dependencies
pytest python/test.py -s -x -Wd # to run tests

# to check supported SIMD instructions:
python -c "import simsimd; print(simsimd.get_capabilities())"
15 changes: 14 additions & 1 deletion README.md
@@ -42,7 +42,8 @@ SimSIMD provides an alternative.

## Features

__SimSIMD__ provides __over 200 SIMD-optimized kernels__ for various distance and similarity measures, accelerating search in [USearch](https://github.com/unum-cloud/usearch) and several DBMS products.
__SimSIMD__ (Arabic: "سيمسيم دي") is a library of __over 200 SIMD-optimized kernels__ for distance and similarity measures, boosting search performance in [USearch](https://github.com/unum-cloud/usearch) and several database systems.
Named after the iconic ["Open Sesame"](https://en.wikipedia.org/wiki/Open_sesame) command from _Ali Baba and the Forty Thieves_, it opens the door to a modern treasure: making the most of today's hardware through high resource utilization.
Implemented distance functions include:

- Euclidean (L2) and Cosine (Angular) spatial distances for Vector Search.
@@ -240,6 +241,18 @@ By default, the output distances will be stored in double-precision `f64` floating-point numbers.
That behavior may not be space-efficient, especially if you are computing the Hamming distance between short binary vectors, which will generally fit into 8x smaller `u8` or `u16` types.
To override this behavior, use the `dtype` argument.
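
For instance, a minimal sketch of such an override might look like the following; the packed-bit input layout and the exact effect of `dtype="u16"` are assumptions based on the description above, not a verified contract:

```py
import numpy as np
import simsimd

# Hypothetical sketch: two batches of 1024-bit vectors, packed into 128 `uint8` words each
a = np.packbits(np.random.randint(0, 2, (100, 1024)), axis=1)
b = np.packbits(np.random.randint(0, 2, (100, 1024)), axis=1)

# Assuming `dtype` overrides the default `f64` output type, as described above;
# a Hamming distance over 1024-bit vectors fits comfortably into `u16`
distances = simsimd.cdist(a, b, metric="hamming", dtype="u16")
```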

### Helper Functions

You can turn specific backends on or off to match the exact target environment.
A common case is avoiding AVX-512 on older AMD CPUs and [Intel Ice Lake](https://travisdowns.github.io/blog/2020/08/19/icl-avx512-freq.html) CPUs, where switching to a lower frequency license can throttle the whole chip.

```py
>>> simsimd.get_capabilities()
{'serial': True, 'neon': False, 'sve': False, 'neon_f16': False, 'sve_f16': False, 'neon_bf16': False, 'sve_bf16': False, 'neon_i8': False, 'sve_i8': False, 'haswell': True, 'skylake': True, 'ice': True, 'genoa': True, 'sapphire': True}
>>> simsimd.disable_capability("sapphire")
>>> simsimd.enable_capability("sapphire")
```
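
Building on the snippet above, a minimal sketch of gating AVX-512 backends by the detected CPU could look like this; the `py-cpuinfo` dependency matches the CI workflow earlier in this commit, but the brand-string check and the chosen capability names are illustrative assumptions:

```py
from cpuinfo import get_cpu_info  # pip install py-cpuinfo
import simsimd

# Illustrative heuristic: drop AVX-512 backends on Ice Lake parts to avoid
# frequency-license downclocking; the substring match is an assumption, not a rule
if "Ice Lake" in get_cpu_info().get("brand_raw", ""):
    for capability in ("skylake", "ice", "sapphire"):
        simsimd.disable_capability(capability)

print(simsimd.get_capabilities())
```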

### Using Python API with USearch

Want to use it in Python with [USearch](https://github.com/unum-cloud/usearch)?
88 changes: 44 additions & 44 deletions include/simsimd/curved.h
@@ -25,8 +25,8 @@

#include "types.h"

#include "dot.h" // `simsimd_partial_load_f16x4_neon` and friends
#include "spatial.h" // `simsimd_substract_bf16x32_genoa`
#include "dot.h" // `_simsimd_partial_load_f16x4_neon` and friends
#include "spatial.h" // `_simsimd_substract_bf16x32_genoa`

#ifdef __cplusplus
extern "C" {
@@ -250,9 +250,9 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16_neon(simsimd_f16_t const* a, simsimd_f1
simsimd_size_t tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a + i, 1)));
float32x4_t b_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b + tail_start, tail_length));
float32x4_t c_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length));
simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + i, 1)));
float32x4_t b_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + tail_start, tail_length));
float32x4_t c_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length));
simsimd_f32_t partial_sum = vaddvq_f32(vmulq_f32(b_vec, c_vec));
sum += a_i * partial_sum;
}
@@ -286,13 +286,13 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_neon(simsimd_f16_t const* a, simsimd
simsimd_size_t tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a + i, 1)));
simsimd_f32_t b_i = vaddvq_f32(vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b + i, 1)));
simsimd_f32_t a_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + i, 1)));
simsimd_f32_t b_i = vaddvq_f32(vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + i, 1)));
simsimd_f32_t diff_i = a_i - b_i;
float32x4_t a_j_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(a + tail_start, tail_length));
float32x4_t b_j_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(b + tail_start, tail_length));
float32x4_t a_j_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(a + tail_start, tail_length));
float32x4_t b_j_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(b + tail_start, tail_length));
float32x4_t diff_j_vec = vsubq_f32(a_j_vec, b_j_vec);
float32x4_t c_vec = vcvt_f32_f16(simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length));
float32x4_t c_vec = vcvt_f32_f16(_simsimd_partial_load_f16x4_neon(c + i * n + tail_start, tail_length));
simsimd_f32_t partial_sum = vaddvq_f32(vmulq_f32(diff_j_vec, c_vec));
sum += diff_i * partial_sum;
}
@@ -333,8 +333,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_neon(simsimd_bf16_t const* a, simsimd_
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = simsimd_uncompress_bf16(a + i);
bfloat16x8_t b_vec = simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length);
bfloat16x8_t c_vec = simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length);
bfloat16x8_t b_vec = _simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length);
bfloat16x8_t c_vec = _simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length);
simsimd_f32_t partial_sum_high = vaddvq_f32(vbfmlaltq_f32(vdupq_n_f32(0), b_vec, c_vec));
simsimd_f32_t partial_sum_low = vaddvq_f32(vbfmlalbq_f32(vdupq_n_f32(0), b_vec, c_vec));
sum += a_i * (partial_sum_high + partial_sum_low);
@@ -385,8 +385,8 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const* a, simsi
simsimd_f32_t a_i = simsimd_uncompress_bf16(a + i);
simsimd_f32_t b_i = simsimd_uncompress_bf16(b + i);
simsimd_f32_t diff_i = a_i - b_i;
bfloat16x8_t a_j_vec = simsimd_partial_load_bf16x8_neon(a + tail_start, tail_length);
bfloat16x8_t b_j_vec = simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length);
bfloat16x8_t a_j_vec = _simsimd_partial_load_bf16x8_neon(a + tail_start, tail_length);
bfloat16x8_t b_j_vec = _simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length);

// Again, upcast for subtraction
float32x4_t a_j_vec_high = vcvt_f32_bf16(vget_high_bf16(a_j_vec));
@@ -397,7 +397,7 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const* a, simsi
float32x4_t diff_j_vec_low = vsubq_f32(a_j_vec_low, b_j_vec_low);
bfloat16x8_t diff_j_vec = vcombine_bf16(vcvt_bf16_f32(diff_j_vec_low), vcvt_bf16_f32(diff_j_vec_high));

bfloat16x8_t c_vec = simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length);
bfloat16x8_t c_vec = _simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length);
simsimd_f32_t partial_sum_high = vaddvq_f32(vbfmlaltq_f32(vdupq_n_f32(0), diff_j_vec, c_vec));
simsimd_f32_t partial_sum_low = vaddvq_f32(vbfmlalbq_f32(vdupq_n_f32(0), diff_j_vec, c_vec));
sum += diff_i * (partial_sum_high + partial_sum_low);
@@ -434,15 +434,15 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16_haswell(simsimd_f16_t const* a, simsimd
}

// Handle the tail of every row
simsimd_f64_t sum = _mm256_reduce_add_ps_dbl(sum_vec);
simsimd_f64_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
simsimd_size_t tail_length = n % 8;
simsimd_size_t tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = _mm256_cvtss_f32(_mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(a + i))));
__m256 b_vec = simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length);
__m256 c_vec = simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length);
simsimd_f32_t partial_sum = _mm256_reduce_add_ps_dbl(_mm256_mul_ps(b_vec, c_vec));
__m256 b_vec = _simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length);
__m256 c_vec = _simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length);
simsimd_f32_t partial_sum = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(b_vec, c_vec));
sum += a_i * partial_sum;
}
}
@@ -470,7 +470,7 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_haswell(simsimd_f16_t const* a, sims
}

// Handle the tail of every row
simsimd_f64_t sum = _mm256_reduce_add_ps_dbl(sum_vec);
simsimd_f64_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
simsimd_size_t tail_length = n % 8;
simsimd_size_t tail_start = n - tail_length;
if (tail_length) {
@@ -479,10 +479,10 @@
_mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(a + i))), //
_mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(b + i)))));
__m256 diff_j_vec = _mm256_sub_ps( //
simsimd_partial_load_f16x8_haswell(a + tail_start, tail_length),
simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length));
__m256 c_vec = simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length);
simsimd_f32_t partial_sum = _mm256_reduce_add_ps_dbl(_mm256_mul_ps(diff_j_vec, c_vec));
_simsimd_partial_load_f16x8_haswell(a + tail_start, tail_length),
_simsimd_partial_load_f16x8_haswell(b + tail_start, tail_length));
__m256 c_vec = _simsimd_partial_load_f16x8_haswell(c + i * n + tail_start, tail_length);
simsimd_f32_t partial_sum = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(diff_j_vec, c_vec));
sum += diff_i * partial_sum;
}
}
@@ -495,29 +495,29 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const* a, simsi
simsimd_distance_t* result) {
__m256 sum_vec = _mm256_setzero_ps();
for (simsimd_size_t i = 0; i != n; ++i) {
// The `simsimd_uncompress_bf16` is cheaper than `simsimd_bf16x8_to_f32x8_haswell`
// The `simsimd_uncompress_bf16` is cheaper than `_simsimd_bf16x8_to_f32x8_haswell`
__m256 a_vec = _mm256_set1_ps(simsimd_uncompress_bf16(a + i));
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 b_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j)));
__m256 c_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
__m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j)));
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(b_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(a_vec, partial_sum_vec, sum_vec);
}

// Handle the tail of every row
simsimd_f64_t sum = _mm256_reduce_add_ps_dbl(sum_vec);
simsimd_f64_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
simsimd_size_t tail_length = n % 8;
simsimd_size_t tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t a_i = simsimd_uncompress_bf16(a + i);
__m256 b_vec = simsimd_bf16x8_to_f32x8_haswell( //
simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length));
__m256 c_vec = simsimd_bf16x8_to_f32x8_haswell( //
simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length));
simsimd_f32_t partial_sum = _mm256_reduce_add_ps_dbl(_mm256_mul_ps(b_vec, c_vec));
__m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell( //
_simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length));
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell( //
_simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length));
simsimd_f32_t partial_sum = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(b_vec, c_vec));
sum += a_i * partial_sum;
}
}
@@ -535,28 +535,28 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const* a, si
_mm256_set1_ps(simsimd_uncompress_bf16(b + i)));
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 diff_j_vec = _mm256_sub_ps( //
simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(a + j))), //
simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j))));
__m256 c_vec = simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
__m256 diff_j_vec = _mm256_sub_ps( //
_simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(a + j))), //
_simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j))));
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(diff_i_vec, partial_sum_vec, sum_vec);
}

// Handle the tail of every row
simsimd_f64_t sum = _mm256_reduce_add_ps_dbl(sum_vec);
simsimd_f64_t sum = _simsimd_reduce_f32x8_haswell(sum_vec);
simsimd_size_t tail_length = n % 8;
simsimd_size_t tail_start = n - tail_length;
if (tail_length) {
for (simsimd_size_t i = 0; i != n; ++i) {
simsimd_f32_t diff_i = simsimd_uncompress_bf16(a + i) - simsimd_uncompress_bf16(b + i);
__m256 diff_j_vec = _mm256_sub_ps( //
simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(a + tail_start, tail_length)),
simsimd_bf16x8_to_f32x8_haswell(simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length)));
__m256 c_vec = simsimd_bf16x8_to_f32x8_haswell(
simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length));
simsimd_f32_t partial_sum = _mm256_reduce_add_ps_dbl(_mm256_mul_ps(diff_j_vec, c_vec));
_simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a + tail_start, tail_length)),
_simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length)));
__m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(
_simsimd_partial_load_bf16x8_haswell(c + i * n + tail_start, tail_length));
simsimd_f32_t partial_sum = _simsimd_reduce_f32x8_haswell(_mm256_mul_ps(diff_j_vec, c_vec));
sum += diff_i * partial_sum;
}
}
@@ -705,7 +705,7 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_genoa(simsimd_bf16_t const* a, sims
b_j_vec = _mm512_maskz_loadu_epi16(tail_mask, b + tail_start);
c_vec = _mm512_maskz_loadu_epi16(tail_mask, c + i * n + tail_start);
}
diff_j_vec = simsimd_substract_bf16x32_genoa(a_j_vec, b_j_vec);
diff_j_vec = _simsimd_substract_bf16x32_genoa(a_j_vec, b_j_vec);
partial_sum_vec = _mm512_dpbf16_ps(partial_sum_vec, (__m512bh)(diff_j_vec), (__m512bh)(c_vec));
j += 32;
if (j < n)
