From 0f9dd97b030def143828ba024776a848e81f246f Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Wed, 14 Feb 2024 02:50:35 -0800 Subject: [PATCH] redo the minimum hashable string length --- src/dyad/core/dyad_core.c | 22 +++++------------- src/dyad/utils/test_murmur3.c | 20 +++++----------- src/dyad/utils/utils.c | 43 +++++++++++++++++++++++++---------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/dyad/core/dyad_core.c b/src/dyad/core/dyad_core.c index 71c4ba78..31636ec6 100644 --- a/src/dyad/core/dyad_core.c +++ b/src/dyad/core/dyad_core.c @@ -49,6 +49,7 @@ static int gen_path_key (const char* restrict str, size_t cx = 0ul; int n = 0; size_t str_len = strlen (str); + const char* str_long = str; if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) { DYAD_C_FUNCTION_END(); @@ -56,26 +57,15 @@ static int gen_path_key (const char* restrict str, } path_key[0] = '\0'; - const char* str_long = str; - -#if 1 - // Strings shorter than 128 bytes collide. Especially, the hash value seems - // to depend on the length of such a string. - // For such a short string, we concatenate it as many times as needed to make - // it longer than 128 bytes. + // Just append the string so that it can be as large as 128 bytes. if (str_len < 128ul) { - char buf[PATH_MAX+1] = {'\0'}; + char buf[256] = {'\0'}; memcpy (buf, str, str_len); - char* str_pos = buf + str_len; - const char* const str_min = buf + 128ul; - while (str_pos < str_min) { - memcpy (str_pos, str, str_len); - str_pos += str_len; - }; - str_len = str_pos - buf; + memset (buf + str_len, '-', 128ul - str_len); + buf[128u] = '\0'; + str_len = 128ul; str_long = buf; } -#endif for (uint32_t d = 0u; d < depth; d++) { seed += seeds[d % 10]; diff --git a/src/dyad/utils/test_murmur3.c b/src/dyad/utils/test_murmur3.c index b720fb49..9bb64f78 100644 --- a/src/dyad/utils/test_murmur3.c +++ b/src/dyad/utils/test_murmur3.c @@ -18,29 +18,21 @@ static int gen_path_key (const char* restrict str, size_t cx = 0ul; int n = 0; size_t str_len = strlen (str); + const char* str_long = str; if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) { return -1; } path_key[0] = '\0'; - const char* str_long = str; - #if 1 - // Strings shorter than 128 bytes collide. Especially, the hash value seems - // to depend on the length of such a string. - // For such a short string, we concatenate it as many times as needed to make - // it longer than 128 bytes. + // Just append the string so that it can be as large as 128 bytes. if (str_len < 128ul) { - char buf[PATH_MAX+1] = {'\0'}; + char buf[256] = {'\0'}; memcpy (buf, str, str_len); - char* str_pos = buf + str_len; - const char* const str_min = buf + 128ul; - while (str_pos < str_min) { - memcpy (str_pos, str, str_len); - str_pos += str_len; - }; - str_len = str_pos - buf; + memset (buf + str_len, '@', 128ul - str_len); + buf[128u] = '\0'; + str_len = 128ul; str_long = buf; } #endif diff --git a/src/dyad/utils/utils.c b/src/dyad/utils/utils.c index 4ef940fb..03184052 100644 --- a/src/dyad/utils/utils.c +++ b/src/dyad/utils/utils.c @@ -64,32 +64,51 @@ uint32_t hash_str (const char* str, const uint32_t seed) { if (!str) return 0u; - const size_t len = strlen (str); - if (len == 0ul) return 0u; + const char* str_long = str; + size_t str_len = strlen (str); + if (str_len == 0ul) return 0u; + + // Just append the string so that it can be as large as 128 bytes. + if (str_len < 128ul) { + char buf[256] = {'\0'}; + memcpy (buf, str, str_len); + memset (buf + str_len, '@', 128ul - str_len); + buf[128u] = '\0'; + str_len = 128ul; + str_long = buf; + } uint32_t hash[4] = {0u}; // Output for the hash - MurmurHash3_x64_128 (str, strlen (str), seed, hash); + MurmurHash3_x64_128 (str_long, str_len, seed, hash); return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1; } /** If hashing is not possible, returns 0. Otherwise, returns a non-zero hash value. - * This does not check if the length of string is correct, but simply use it */ + * This only hashes the prefix of a given length */ uint32_t hash_path_prefix (const char* str, const uint32_t seed, const size_t len) { - char strbuf [PATH_MAX+1] = {'\0'}; - uint32_t hash[4] = {0u}; // Output for the first hash with len1 - if (!str || len == 0ul) { return 0u; } + const char* str_long = str; + size_t str_len = strlen (str); - memcpy (strbuf, str, (len > PATH_MAX)? PATH_MAX : len); - const size_t buf_len = strlen (strbuf); - if (buf_len != len) { - return 0u; + if (str_len < len) return 0u; + str_len = len; + + // Just append the string so that it can be as large as 128 bytes. + if (len < 128ul) { + char buf[256] = {'\0'}; + memcpy (buf, str, len); + memset (buf + len, '@', 128ul - len); + buf[128u] = '\0'; + str_len = 128ul; + str_long = buf; } - MurmurHash3_x64_128 (str, buf_len, seed, hash); + + uint32_t hash[4] = {0u}; // Output for the hash + MurmurHash3_x64_128 (str_long, str_len, seed, hash); return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1; }