Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve hash distribution with short strings #109

Merged
merged 2 commits into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions src/dyad/core/dyad_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,28 @@ static int gen_path_key (const char* restrict str,
uint32_t hash[4] = {0u}; // Output for the hash
size_t cx = 0ul;
int n = 0;
size_t str_len = strlen (str);
const char* str_long = str;

if (str == NULL || path_key == NULL || len == 0ul) {
if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) {
DYAD_C_FUNCTION_END();
return -1;
}
path_key[0] = '\0';

// Pad strings shorter than 128 bytes with '@' up to 128 bytes so that short inputs hash with a better distribution.
if (str_len < 128ul) {
char buf[256] = {'\0'};
memcpy (buf, str, str_len);
memset (buf + str_len, '@', 128ul - str_len);
buf[128u] = '\0';
str_len = 128ul;
str_long = buf;
}

for (uint32_t d = 0u; d < depth; d++) {
seed += seeds[d % 10];
// TODO add assert that str is not NULL
MurmurHash3_x64_128 (str, strlen (str), seed, hash);
MurmurHash3_x64_128 (str_long, str_len, seed, hash);
uint32_t bin = (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) % width;
n = snprintf (path_key + cx, len - cx, "%x.", bin);
cx += n;
Expand Down
8 changes: 8 additions & 0 deletions src/dyad/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ add_executable(test_cmp_canonical_path_prefix test_cmp_canonical_path_prefix.c
${CMAKE_CURRENT_SOURCE_DIR}/../common/dyad_structures.h)
target_compile_definitions(test_cmp_canonical_path_prefix PUBLIC DYAD_HAS_CONFIG)
target_link_libraries(test_cmp_canonical_path_prefix PUBLIC ${PROJECT_NAME}_utils)

add_executable(test_murmur3 test_murmur3.c)
target_compile_definitions(test_murmur3 PUBLIC DYAD_HAS_CONFIG)
target_link_libraries(test_murmur3 PUBLIC ${PROJECT_NAME}_murmur3)

if(DYAD_LOGGER STREQUAL "CPP_LOGGER")
target_link_libraries(test_cmp_canonical_path_prefix PRIVATE ${CPP_LOGGER_LIBRARIES})
endif()
Expand All @@ -49,6 +54,9 @@ endif()

if (TARGET DYAD_C_FLAGS_werror)
target_link_libraries(${PROJECT_NAME}_utils PRIVATE DYAD_C_FLAGS_werror)
target_link_libraries(${PROJECT_NAME}_murmur3 PRIVATE DYAD_C_FLAGS_werror)
target_link_libraries(test_murmur3 PRIVATE DYAD_C_FLAGS_werror)
target_link_libraries(test_cmp_canonical_path_prefix PRIVATE DYAD_C_FLAGS_werror)
endif ()

install(
Expand Down
76 changes: 76 additions & 0 deletions src/dyad/utils/test_murmur3.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "dyad/utils/murmur3.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>

/**
 * Build a hashed path key of the form "<bin>.<bin>. ... <str>".
 *
 * One hexadecimal bin is emitted per level of `depth`. Each bin is the
 * XOR-folded 128-bit MurmurHash3 of the (padded) input string, reduced
 * modulo `width`. The seed is advanced per level from a fixed table.
 * The original string is appended after the bins.
 *
 * Strings shorter than 128 bytes are padded with '@' up to 128 bytes before
 * hashing; only the hash input is padded, the appended string is not.
 *
 * @param str       NUL-terminated input string; must be non-NULL and non-empty.
 * @param path_key  Output buffer that receives the NUL-terminated key.
 * @param len       Capacity of `path_key` in bytes; must be non-zero.
 * @param depth     Number of hashed bins to emit (0 emits only `str`).
 * @param width     Modulus applied to each bin; must be non-zero when
 *                  `depth` is non-zero.
 * @return 0 on success; -1 on invalid arguments, on an snprintf error, or
 *         when the key does not fit in `len` bytes (the buffer may then
 *         hold a truncated key).
 */
static int gen_path_key (const char* restrict str,
                         char* restrict path_key,
                         const size_t len,
                         const uint32_t depth,
                         const uint32_t width)
{
    static const uint32_t seeds[10] =
        {104677u, 104681u, 104683u, 104693u, 104701u,
         104707u, 104711u, 104717u, 104723u, 104729u};
    enum { MIN_HASH_LEN = 128 };

    // Declared at function scope: the hash loop below reads this buffer, so
    // it must outlive the block that fills it.
    char padded[MIN_HASH_LEN + 1];
    uint32_t seed = 57u;
    uint32_t hash[4] = {0u}; // Output for the hash
    size_t cx = 0ul;

    if (str == NULL || path_key == NULL || len == 0ul) {
        return -1;
    }
    // A zero width would make the modulo below undefined.
    if (depth > 0u && width == 0u) {
        return -1;
    }

    // Only measure the string once it is known to be non-NULL.
    size_t str_len = strlen (str);
    if (str_len == 0ul) {
        return -1;
    }
    path_key[0] = '\0';

    const char* hash_input = str;
    if (str_len < (size_t) MIN_HASH_LEN) {
        memcpy (padded, str, str_len);
        memset (padded + str_len, '@', (size_t) MIN_HASH_LEN - str_len);
        padded[MIN_HASH_LEN] = '\0';
        str_len = (size_t) MIN_HASH_LEN;
        hash_input = padded;
    }

    for (uint32_t d = 0u; d < depth; d++) {
        seed += seeds[d % 10];
        MurmurHash3_x64_128 (hash_input, str_len, seed, hash);
        const uint32_t bin = (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) % width;
        const int n = snprintf (path_key + cx, len - cx, "%x.", bin);
        // Check for an error before folding n into the unsigned offset.
        if (n < 0) {
            return -1;
        }
        cx += (size_t) n;
        if (cx >= len) {
            return -1;
        }
    }

    const int n = snprintf (path_key + cx, len - cx, "%s", str);
    if (n < 0 || cx + (size_t) n >= len) {
        return -1;
    }

    return 0;
}


/**
 * Command-line driver: prints the path key generated for each input string.
 *
 * Usage: <prog> depth width str1 [str2 [str3 ...]]
 *
 * `depth` must be a non-negative decimal integer and `width` a positive one;
 * both are validated because they are converted to uint32_t and `width` is
 * used as a modulus by gen_path_key. For every string, one line
 * "<str>\t<path_key>" is written to stdout. Strings for which key generation
 * fails are reported on stderr instead.
 *
 * @return EXIT_SUCCESS when every key was generated, EXIT_FAILURE on bad
 *         arguments or when any key could not be generated.
 */
int main (int argc, char** argv)
{
    if (argc < 4) {
        printf ("Usage: %s depth width str1 [str2 [str3 ...]]\n", argv[0]);
        return EXIT_FAILURE;
    }

    char* end = NULL;
    const long depth = strtol (argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || depth < 0 || depth > INT_MAX) {
        fprintf (stderr, "Invalid depth: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    const long width = strtol (argv[2], &end, 10);
    // width is used as a modulus, so zero must be rejected as well.
    if (end == argv[2] || *end != '\0' || width <= 0 || width > INT_MAX) {
        fprintf (stderr, "Invalid width: %s\n", argv[2]);
        return EXIT_FAILURE;
    }

    int status = EXIT_SUCCESS;
    for (int i = 3; i < argc; i++) {
        char path_key [PATH_MAX + 1] = {'\0'};
        if (gen_path_key (argv[i], path_key, PATH_MAX,
                          (uint32_t) depth, (uint32_t) width) != 0) {
            // Do not print a partial key as if it were valid.
            fprintf (stderr, "Failed to generate a path key for: %s\n", argv[i]);
            status = EXIT_FAILURE;
            continue;
        }
        printf("%s\t%s\n", argv[i], path_key);
    }

    return status;
}
43 changes: 31 additions & 12 deletions src/dyad/utils/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,32 +64,51 @@
uint32_t hash_str (const char* str, const uint32_t seed)
{
if (!str) return 0u;
const size_t len = strlen (str);
if (len == 0ul) return 0u;
const char* str_long = str;
size_t str_len = strlen (str);
if (str_len == 0ul) return 0u;

// Pad strings shorter than 128 bytes with '@' up to 128 bytes so that short inputs hash with a better distribution.
if (str_len < 128ul) {
char buf[256] = {'\0'};
memcpy (buf, str, str_len);
memset (buf + str_len, '@', 128ul - str_len);
buf[128u] = '\0';
str_len = 128ul;
str_long = buf;
}

uint32_t hash[4] = {0u}; // Output for the hash
MurmurHash3_x64_128 (str, strlen (str), seed, hash);
MurmurHash3_x64_128 (str_long, str_len, seed, hash);
return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1;
}

/** If hashing is not possible, returns 0. Otherwise, returns a non-zero hash value.
* This does not check if the length of string is correct, but simply use it */
* This only hashes the prefix of a given length */
uint32_t hash_path_prefix (const char* str, const uint32_t seed,
const size_t len)
{
char strbuf [PATH_MAX+1] = {'\0'};
uint32_t hash[4] = {0u}; // Output for the first hash with len1

if (!str || len == 0ul) {
return 0u;
}
const char* str_long = str;
size_t str_len = strlen (str);

memcpy (strbuf, str, (len > PATH_MAX)? PATH_MAX : len);
const size_t buf_len = strlen (strbuf);
if (buf_len != len) {
return 0u;
if (str_len < len) return 0u;
str_len = len;

// Pad prefixes shorter than 128 bytes with '@' up to 128 bytes so that short inputs hash with a better distribution.
if (len < 128ul) {
char buf[256] = {'\0'};
memcpy (buf, str, len);
memset (buf + len, '@', 128ul - len);
buf[128u] = '\0';
str_len = 128ul;
str_long = buf;
}
MurmurHash3_x64_128 (str, buf_len, seed, hash);

uint32_t hash[4] = {0u}; // Output for the hash
MurmurHash3_x64_128 (str_long, str_len, seed, hash);
return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1;
}

Expand Down
Loading