Skip to content

Commit

Permalink
redo the minimum hashable string length
Browse files Browse the repository at this point in the history
  • Loading branch information
JaeseungYeom committed Feb 14, 2024
1 parent 49fe0b0 commit 0f9dd97
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 42 deletions.
22 changes: 6 additions & 16 deletions src/dyad/core/dyad_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,33 +49,23 @@ static int gen_path_key (const char* restrict str,
size_t cx = 0ul;
int n = 0;
size_t str_len = strlen (str);
const char* str_long = str;

if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) {
DYAD_C_FUNCTION_END();
return -1;
}
path_key[0] = '\0';

const char* str_long = str;

#if 1
// Strings shorter than 128 bytes collide. Especially, the hash value seems
// to depend on the length of such a string.
// For such a short string, we concatenate it as many times as needed to make
// it longer than 128 bytes.
// Pad the string with '-' so that it is exactly 128 bytes long.
if (str_len < 128ul) {
char buf[PATH_MAX+1] = {'\0'};
char buf[256] = {'\0'};
memcpy (buf, str, str_len);
char* str_pos = buf + str_len;
const char* const str_min = buf + 128ul;
while (str_pos < str_min) {
memcpy (str_pos, str, str_len);
str_pos += str_len;
};
str_len = str_pos - buf;
memset (buf + str_len, '-', 128ul - str_len);
buf[128u] = '\0';
str_len = 128ul;
str_long = buf;
}
#endif

for (uint32_t d = 0u; d < depth; d++) {
seed += seeds[d % 10];
Expand Down
20 changes: 6 additions & 14 deletions src/dyad/utils/test_murmur3.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,21 @@ static int gen_path_key (const char* restrict str,
size_t cx = 0ul;
int n = 0;
size_t str_len = strlen (str);
const char* str_long = str;

if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) {
return -1;
}
path_key[0] = '\0';

const char* str_long = str;

#if 1
// Strings shorter than 128 bytes collide. Especially, the hash value seems
// to depend on the length of such a string.
// For such a short string, we concatenate it as many times as needed to make
// it longer than 128 bytes.
// Pad the string with '@' so that it is exactly 128 bytes long.
if (str_len < 128ul) {
char buf[PATH_MAX+1] = {'\0'};
char buf[256] = {'\0'};
memcpy (buf, str, str_len);
char* str_pos = buf + str_len;
const char* const str_min = buf + 128ul;
while (str_pos < str_min) {
memcpy (str_pos, str, str_len);
str_pos += str_len;
};
str_len = str_pos - buf;
memset (buf + str_len, '@', 128ul - str_len);
buf[128u] = '\0';
str_len = 128ul;
str_long = buf;
}
#endif
Expand Down
43 changes: 31 additions & 12 deletions src/dyad/utils/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,32 +64,51 @@
/**
 * Hash a NUL-terminated string with MurmurHash3_x64_128 and fold the 128-bit
 * digest into a single 32-bit value.
 *
 * Inputs shorter than 128 bytes are padded with '@' up to exactly 128 bytes
 * before hashing.
 *
 * @param str   NUL-terminated string to hash.
 * @param seed  Seed forwarded to MurmurHash3_x64_128.
 * @return 0 if str is NULL or empty. Otherwise the XOR of the four 32-bit
 *         digest words plus 1.
 *         NOTE(review): the sum wraps to 0 when the XOR equals UINT32_MAX,
 *         so a successful hash is not strictly guaranteed to be non-zero.
 */
uint32_t hash_str (const char* str, const uint32_t seed)
{
    if (!str) {
        return 0u;
    }

    const char* str_long = str;
    size_t str_len = strlen (str);
    if (str_len == 0ul) {
        return 0u;
    }

    // The padding buffer lives at function scope because str_long may point
    // into it when the hash is computed below. Declaring it inside the `if`
    // block would leave str_long dangling once that block ends.
    char buf[256] = {'\0'};

    // Pad the string with '@' so that it is exactly 128 bytes long.
    if (str_len < 128ul) {
        memcpy (buf, str, str_len);
        memset (buf + str_len, '@', 128ul - str_len);
        buf[128u] = '\0';
        str_len = 128ul;
        str_long = buf;
    }

    uint32_t hash[4] = {0u};  // Output for the hash
    MurmurHash3_x64_128 (str_long, str_len, seed, hash);
    return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1;
}

/**
 * Hash only the first len bytes of str with MurmurHash3_x64_128 and fold the
 * 128-bit digest into a single 32-bit value.
 *
 * Prefixes shorter than 128 bytes are padded with '@' up to exactly 128 bytes
 * before hashing.
 *
 * @param str   NUL-terminated string whose prefix is hashed.
 * @param seed  Seed forwarded to MurmurHash3_x64_128.
 * @param len   Number of leading bytes of str to hash.
 * @return 0 if hashing is not possible (str is NULL, len is 0, or str is
 *         shorter than len). Otherwise the XOR of the four 32-bit digest
 *         words plus 1.
 *         NOTE(review): the sum wraps to 0 when the XOR equals UINT32_MAX,
 *         so a successful hash is not strictly guaranteed to be non-zero.
 */
uint32_t hash_path_prefix (const char* str, const uint32_t seed,
                           const size_t len)
{
    if (!str || len == 0ul) {
        return 0u;
    }

    const char* str_long = str;
    size_t str_len = strlen (str);

    // The requested prefix must lie entirely within the string.
    if (str_len < len) {
        return 0u;
    }
    str_len = len;

    // The padding buffer lives at function scope because str_long may point
    // into it when the hash is computed below. Declaring it inside the `if`
    // block would leave str_long dangling once that block ends.
    char buf[256] = {'\0'};

    // Pad the prefix with '@' so that it is exactly 128 bytes long.
    if (len < 128ul) {
        memcpy (buf, str, len);
        memset (buf + len, '@', 128ul - len);
        buf[128u] = '\0';
        str_len = 128ul;
        str_long = buf;
    }

    uint32_t hash[4] = {0u};  // Output for the hash
    MurmurHash3_x64_128 (str_long, str_len, seed, hash);
    return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1;
}

Expand Down

0 comments on commit 0f9dd97

Please sign in to comment.