Skip to content

Commit

Permalink
Merge branch 'main' into main-dev
Browse files (browse the repository at this point in the history)
  • Loading branch information
ashvardanian committed Sep 17, 2024
2 parents 7e8cdba + adce71b commit 4a2ed3a
Show file tree
Hide file tree
Showing 14 changed files with 723 additions and 251 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(
simsimd
VERSION 5.2.1
VERSION 5.3.0
LANGUAGES C CXX
DESCRIPTION "Fastest SIMD-Accelerated Vector Similarity Functions for x86 and Arm"
HOMEPAGE_URL "https://github.com/ashvardanian/simsimd"
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "simsimd"
description = "Fastest SIMD-Accelerated Vector Similarity Functions for x86 and Arm"
version = "5.2.1"
version = "5.3.0"
edition = "2021"
license = "Apache-2.0"
authors = ["Ash Vardanian <[email protected]>"]
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -690,7 +690,7 @@ To explicitly disable half-precision support, define the following macro before
> But if you are running on different generations of devices, it makes sense to pre-compile the library for all supported generations at once, and dispatch at runtime.
> This flag does just that and is used to produce the `simsimd.so` shared library, as well as the Python and other bindings.
`SIMSIMD_TARGET_ARM` (`SIMSIMD_TARGET_NEON`, `SIMSIMD_TARGET_SVE`, `SIMSIMD_TARGET_NEON_F16`, `SIMSIMD_TARGET_SVE_F16`, `SIMSIMD_TARGET_NEON_BF16`, `SIMSIMD_TARGET_SVE_BF16`), `SIMSIMD_TARGET_X86` (`SIMSIMD_TARGET_HASWELL`, `SIMSIMD_TARGET_SKYLAKE`, `SIMSIMD_TARGET_ICE`, `SIMSIMD_TARGET_GENOA`, `SIMSIMD_TARGET_SAPPHIRE`):
`SIMSIMD_TARGET_ARM` (`SIMSIMD_TARGET_NEON`, `SIMSIMD_TARGET_SVE`, `SIMSIMD_TARGET_SVE2`, `SIMSIMD_TARGET_NEON_F16`, `SIMSIMD_TARGET_SVE_F16`, `SIMSIMD_TARGET_NEON_BF16`, `SIMSIMD_TARGET_SVE_BF16`), `SIMSIMD_TARGET_X86` (`SIMSIMD_TARGET_HASWELL`, `SIMSIMD_TARGET_SKYLAKE`, `SIMSIMD_TARGET_ICE`, `SIMSIMD_TARGET_GENOA`, `SIMSIMD_TARGET_SAPPHIRE`):
> By default, SimSIMD automatically infers the target architecture and pre-compiles as many kernels as possible.
> In some cases, you may want to explicitly disable some of the kernels.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
5.2.1
5.3.0
12 changes: 11 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,17 @@ fn main() {

let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
let flags_to_try = match target_arch.as_str() {
"arm" | "aarch64" => vec!["SIMSIMD_TARGET_NEON", "SIMSIMD_TARGET_SVE"],
"arm" | "aarch64" => vec![
"SIMSIMD_TARGET_SVE2",
"SIMSIMD_TARGET_SVE_BF16",
"SIMSIMD_TARGET_SVE_F16",
"SIMSIMD_TARGET_SVE_I8",
"SIMSIMD_TARGET_SVE",
"SIMSIMD_TARGET_NEON_BF16",
"SIMSIMD_TARGET_NEON_F16",
"SIMSIMD_TARGET_NEON_I8",
"SIMSIMD_TARGET_NEON",
],
_ => vec![
"SIMSIMD_TARGET_SAPPHIRE",
"SIMSIMD_TARGET_GENOA",
Expand Down
3 changes: 3 additions & 0 deletions c/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
#if !defined(SIMSIMD_TARGET_SVE) && (defined(__linux__))
#define SIMSIMD_TARGET_SVE 1
#endif
#if !defined(SIMSIMD_TARGET_SVE2) && (defined(__linux__))
#define SIMSIMD_TARGET_SVE2 1
#endif
#if !defined(SIMSIMD_TARGET_HASWELL) && (defined(_MSC_VER) || defined(__APPLE__) || defined(__linux__))
#define SIMSIMD_TARGET_HASWELL 1
#endif
Expand Down
41 changes: 25 additions & 16 deletions cpp/bench.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ void measure_sparse(bm::State& state, metric_at metric, metric_at baseline, std:
mean_error /= pairs.size();
state.counters["error"] = mean_error;
state.counters["bytes"] =
bm::Counter(iterations * pairs[0].a.size_bytes() * pairs[0].b.size_bytes(), bm::Counter::kIsRate);
bm::Counter(iterations * (pairs[0].a.size_bytes() + pairs[0].b.size_bytes()), bm::Counter::kIsRate);
state.counters["pairs"] = bm::Counter(iterations, bm::Counter::kIsRate);
state.counters["matches"] =
std::accumulate(results_contender.begin(), results_contender.end(), 0.0) / results_contender.size();
Expand All @@ -482,8 +482,8 @@ void measure_sparse(bm::State& state, metric_at metric, metric_at baseline, std:
template <simsimd_datatype_t datatype_ak, typename metric_at = void>
void dense_(std::string name, metric_at* distance_func, metric_at* baseline_func) {
using pair_t = vectors_pair_gt<datatype_ak>;
std::string name_dims = name + "_" + std::to_string(dense_dimensions) + "d";
bm::RegisterBenchmark(name_dims.c_str(), measure_dense<pair_t, metric_at*>, distance_func, baseline_func,
std::string bench_name = name + "<" + std::to_string(dense_dimensions) + "d>";
bm::RegisterBenchmark(bench_name.c_str(), measure_dense<pair_t, metric_at*>, distance_func, baseline_func,
dense_dimensions)
->MinTime(default_seconds)
->Threads(default_threads);
Expand All @@ -495,15 +495,18 @@ void sparse_(std::string name, metric_at* distance_func, metric_at* baseline_fun
using pair_t = vectors_pair_gt<datatype_ak>;

// Register different lengths, intersection sizes, and distributions
// 2 first lengths * 3 second lengths * 3 intersection sizes = 18 benchmarks for each metric.
for (std::size_t first_len : {128, 1024}) { //< 2 lengths
for (std::size_t second_len_multiplier : {1, 8, 64}) { //< 3 lengths
for (std::size_t intersection_size : {1, 8, 64}) { //< 3 sizes

// 2 first lengths * 3 second length multipliers * 4 intersection grades = 24 combinations for each metric, of which those with |B| > 8192 are skipped below, leaving 20 registered benchmarks.
for (std::size_t first_len : {128, 1024}) { //< 2 lengths
for (std::size_t second_len_multiplier : {1, 8, 64}) { //< 3 length multipliers
for (double intersection_share : {0.01, 0.05, 0.5, 0.95}) { //< 4 intersection grades
std::size_t intersection_size = static_cast<std::size_t>(first_len * intersection_share);
std::size_t second_len = first_len * second_len_multiplier;
std::string test_name = name + "_" + std::to_string(first_len) + "d^" + std::to_string(second_len) +
"d_w" + std::to_string(intersection_size) + "matches";
bm::RegisterBenchmark(test_name.c_str(), measure_sparse<pair_t, metric_at*>, distance_func,
std::string bench_name = name + "<|A|=" + std::to_string(first_len) +
",|B|=" + std::to_string(second_len) +
",|A∩B|=" + std::to_string(intersection_size) + ">";
if (second_len > 8192)
continue;
bm::RegisterBenchmark(bench_name.c_str(), measure_sparse<pair_t, metric_at*>, distance_func,
baseline_func, first_len, second_len, intersection_size)
->MinTime(default_seconds)
->Threads(default_threads);
Expand All @@ -516,8 +519,8 @@ template <simsimd_datatype_t datatype_ak, typename metric_at = void>
void curved_(std::string name, metric_at* distance_func, metric_at* baseline_func) {

using pair_t = vectors_pair_gt<datatype_ak>;
std::string name_dims = name + "_" + std::to_string(curved_dimensions) + "d";
bm::RegisterBenchmark(name_dims.c_str(), measure_curved<pair_t, metric_at*>, distance_func, baseline_func,
std::string bench_name = name + "<" + std::to_string(curved_dimensions) + "d>";
bm::RegisterBenchmark(bench_name.c_str(), measure_curved<pair_t, metric_at*>, distance_func, baseline_func,
curved_dimensions)
->MinTime(default_seconds)
->Threads(default_threads);
Expand Down Expand Up @@ -570,6 +573,7 @@ int main(int argc, char** argv) {
std::printf("Compile-time settings:\n");
std::printf("- Arm NEON support enabled: %s\n", flags[SIMSIMD_TARGET_NEON]);
std::printf("- Arm SVE support enabled: %s\n", flags[SIMSIMD_TARGET_SVE]);
std::printf("- Arm SVE2 support enabled: %s\n", flags[SIMSIMD_TARGET_SVE2]);
std::printf("- x86 Haswell support enabled: %s\n", flags[SIMSIMD_TARGET_HASWELL]);
std::printf("- x86 Skylake support enabled: %s\n", flags[SIMSIMD_TARGET_SKYLAKE]);
std::printf("- x86 Ice Lake support enabled: %s\n", flags[SIMSIMD_TARGET_ICE]);
Expand All @@ -585,6 +589,7 @@ int main(int argc, char** argv) {
std::printf("- Arm SVE F16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_f16_k) != 0]);
std::printf("- Arm SVE BF16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_bf16_k) != 0]);
std::printf("- Arm SVE I8 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_i8_k) != 0]);
std::printf("- Arm SVE2 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve2_k) != 0]);
std::printf("- x86 Haswell support enabled: %s\n", flags[(runtime_caps & simsimd_cap_haswell_k) != 0]);
std::printf("- x86 Skylake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_skylake_k) != 0]);
std::printf("- x86 Ice Lake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_ice_k) != 0]);
Expand Down Expand Up @@ -663,6 +668,9 @@ int main(int argc, char** argv) {
curved_<f16_k>("mahalanobis_f16_neon", simsimd_mahalanobis_f16_neon, simsimd_mahalanobis_f16_accurate);
curved_<bf16_k>("bilinear_bf16_neon", simsimd_bilinear_bf16_neon, simsimd_bilinear_bf16_accurate);
curved_<bf16_k>("mahalanobis_bf16_neon", simsimd_mahalanobis_bf16_neon, simsimd_mahalanobis_bf16_accurate);

sparse_<u16_k>("intersect_u16_neon", simsimd_intersect_u16_neon, simsimd_intersect_u16_accurate);
sparse_<u32_k>("intersect_u32_neon", simsimd_intersect_u32_neon, simsimd_intersect_u32_accurate);
#endif

#if SIMSIMD_TARGET_SVE
Expand Down Expand Up @@ -690,10 +698,11 @@ int main(int argc, char** argv) {
dense_<f32c_k>("vdot_f32c_sve", simsimd_vdot_f32c_sve, simsimd_vdot_f32c_accurate);
dense_<f64c_k>("dot_f64c_sve", simsimd_dot_f64c_sve, simsimd_dot_f64c_serial);
dense_<f64c_k>("vdot_f64c_sve", simsimd_vdot_f64c_sve, simsimd_vdot_f64c_serial);
#endif

sparse_<u16_k>("intersect_u16_sve", simsimd_intersect_u16_sve, simsimd_intersect_u16_accurate);
sparse_<u32_k>("intersect_u32_sve", simsimd_intersect_u32_sve, simsimd_intersect_u32_accurate);

#if SIMSIMD_TARGET_SVE2
sparse_<u16_k>("intersect_u16_sve2", simsimd_intersect_u16_sve2, simsimd_intersect_u16_accurate);
sparse_<u32_k>("intersect_u32_sve2", simsimd_intersect_u32_sve2, simsimd_intersect_u32_accurate);
#endif

#if SIMSIMD_TARGET_HASWELL
Expand Down
2 changes: 2 additions & 0 deletions cpp/test.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ void print_capabilities(void) {
printf("Compile-time settings:\n");
printf("- Arm NEON support enabled: %s\n", flags[SIMSIMD_TARGET_NEON]);
printf("- Arm SVE support enabled: %s\n", flags[SIMSIMD_TARGET_SVE]);
printf("- Arm SVE2 support enabled: %s\n", flags[SIMSIMD_TARGET_SVE2]);
printf("- x86 Haswell support enabled: %s\n", flags[SIMSIMD_TARGET_HASWELL]);
printf("- x86 Skylake support enabled: %s\n", flags[SIMSIMD_TARGET_SKYLAKE]);
printf("- x86 Ice Lake support enabled: %s\n", flags[SIMSIMD_TARGET_ICE]);
Expand All @@ -43,6 +44,7 @@ void print_capabilities(void) {
printf("- Arm SVE F16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_f16_k) != 0]);
printf("- Arm SVE BF16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_bf16_k) != 0]);
printf("- Arm SVE I8 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_i8_k) != 0]);
printf("- Arm SVE2 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve2_k) != 0]);
printf("- x86 Haswell support enabled: %s\n", flags[(runtime_caps & simsimd_cap_haswell_k) != 0]);
printf("- x86 Skylake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_skylake_k) != 0]);
printf("- x86 Ice Lake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_ice_k) != 0]);
Expand Down
48 changes: 36 additions & 12 deletions include/simsimd/simsimd.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@
#define SIMSIMD_H

#define SIMSIMD_VERSION_MAJOR 5
#define SIMSIMD_VERSION_MINOR 2
#define SIMSIMD_VERSION_PATCH 1
#define SIMSIMD_VERSION_MINOR 3
#define SIMSIMD_VERSION_PATCH 0

/**
* @brief Removes compile-time dispatching, and replaces it with runtime dispatching.
Expand Down Expand Up @@ -181,6 +181,8 @@ typedef enum {
simsimd_cap_sve_f16_k = 1 << 25, ///< ARM SVE `f16` capability
simsimd_cap_sve_bf16_k = 1 << 26, ///< ARM SVE `bf16` capability
simsimd_cap_sve_i8_k = 1 << 27, ///< ARM SVE `i8` capability
simsimd_cap_sve2_k = 1 << 28, ///< ARM SVE2 capability
simsimd_cap_sve2p1_k = 1 << 29, ///< ARM SVE2p1 capability

} simsimd_capability_t;

Expand Down Expand Up @@ -437,8 +439,8 @@ SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities_arm(void) {
// - 0b0001: SVE2 is implemented
// - 0b0010: SVE2.1 is implemented
// This value must match the existing indicator obtained from ID_AA64PFR0_EL1:
// unsigned supports_sve = ((id_aa64zfr0_el1) & 0xF) >= 1;
// unsigned supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 2;
unsigned supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 1;
unsigned supports_sve2p1 = ((id_aa64zfr0_el1) & 0xF) >= 2;
unsigned supports_neon = 1; // NEON is always supported

return (simsimd_capability_t)( //
Expand All @@ -450,6 +452,8 @@ SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities_arm(void) {
(simsimd_cap_sve_f16_k * (supports_sve && supports_fp16)) | //
(simsimd_cap_sve_bf16_k * (supports_sve && supports_sve_bf16)) | //
(simsimd_cap_sve_i8_k * (supports_sve && supports_sve_i8mm)) | //
(simsimd_cap_sve2_k * (supports_sve2)) | //
(simsimd_cap_sve2p1_k * (supports_sve2p1)) | //
(simsimd_cap_serial_k));
#else // SIMSIMD_DEFINED_LINUX
return simsimd_cap_serial_k;
Expand Down Expand Up @@ -1003,10 +1007,17 @@ SIMSIMD_PUBLIC void simsimd_find_metric_punned( //
// Unsigned 16-bit integer vectors
case simsimd_datatype_u16_k: {

#if SIMSIMD_TARGET_SVE
#if SIMSIMD_TARGET_SVE2
if (viable & simsimd_cap_sve_k)
switch (kind) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_sve2, *c = simsimd_cap_sve_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON
if (viable & simsimd_cap_neon_k)
switch (kind) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_neon, *c = simsimd_cap_neon_k; return;
default: break;
}
#endif
Expand All @@ -1028,10 +1039,17 @@ SIMSIMD_PUBLIC void simsimd_find_metric_punned( //
// Unsigned 32-bit integer vectors
case simsimd_datatype_u32_k: {

#if SIMSIMD_TARGET_SVE
#if SIMSIMD_TARGET_SVE2
if (viable & simsimd_cap_sve_k)
switch (kind) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_sve, *c = simsimd_cap_sve_k; return;
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_sve2, *c = simsimd_cap_sve_k; return;
default: break;
}
#endif
#if SIMSIMD_TARGET_NEON
if (viable & simsimd_cap_neon_k)
switch (kind) {
case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_neon, *c = simsimd_cap_neon_k; return;
default: break;
}
#endif
Expand Down Expand Up @@ -1106,6 +1124,7 @@ SIMSIMD_DYNAMIC int simsimd_uses_sve(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve_f16(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve_bf16(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve_i8(void);
SIMSIMD_DYNAMIC int simsimd_uses_sve2(void);
SIMSIMD_DYNAMIC int simsimd_uses_haswell(void);
SIMSIMD_DYNAMIC int simsimd_uses_skylake(void);
SIMSIMD_DYNAMIC int simsimd_uses_ice(void);
Expand Down Expand Up @@ -1260,6 +1279,7 @@ SIMSIMD_PUBLIC int simsimd_uses_sve(void) { return SIMSIMD_TARGET_ARM && SIMSIMD
SIMSIMD_PUBLIC int simsimd_uses_sve_f16(void) { return SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE && SIMSIMD_NATIVE_F16; }
SIMSIMD_PUBLIC int simsimd_uses_sve_bf16(void) { return SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE && SIMSIMD_NATIVE_BF16; }
SIMSIMD_PUBLIC int simsimd_uses_sve_i8(void) { return SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE; }
SIMSIMD_PUBLIC int simsimd_uses_sve2(void) { return SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE2; }
SIMSIMD_PUBLIC int simsimd_uses_haswell(void) { return SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_HASWELL; }
SIMSIMD_PUBLIC int simsimd_uses_skylake(void) { return SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_SKYLAKE; }
SIMSIMD_PUBLIC int simsimd_uses_ice(void) { return SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_ICE; }
Expand Down Expand Up @@ -1697,8 +1717,10 @@ SIMSIMD_PUBLIC void simsimd_js_f64(simsimd_f64_t const* a, simsimd_f64_t const*
*/
SIMSIMD_PUBLIC void simsimd_intersect_u16(simsimd_u16_t const* a, simsimd_u16_t const* b, simsimd_size_t a_length,
simsimd_size_t b_length, simsimd_distance_t* d) {
#if SIMSIMD_TARGET_SVE
simsimd_intersect_u16_sve(a, b, a_length, b_length, d);
#if SIMSIMD_TARGET_SVE2
simsimd_intersect_u16_sve2(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_NEON
simsimd_intersect_u16_neon(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_intersect_u16_ice(a, b, a_length, b_length, d);
#else
Expand All @@ -1708,8 +1730,10 @@ SIMSIMD_PUBLIC void simsimd_intersect_u16(simsimd_u16_t const* a, simsimd_u16_t

SIMSIMD_PUBLIC void simsimd_intersect_u32(simsimd_u32_t const* a, simsimd_u32_t const* b, simsimd_size_t a_length,
simsimd_size_t b_length, simsimd_distance_t* d) {
#if SIMSIMD_TARGET_SVE
simsimd_intersect_u32_sve(a, b, a_length, b_length, d);
#if SIMSIMD_TARGET_SVE2
simsimd_intersect_u32_sve2(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_NEON
simsimd_intersect_u32_neon(a, b, a_length, b_length, d);
#elif SIMSIMD_TARGET_SKYLAKE
simsimd_intersect_u32_ice(a, b, a_length, b_length, d);
#else
Expand Down
Loading

0 comments on commit 4a2ed3a

Please sign in to comment.