Skip to content

Commit

Permalink
Optimize Levenshtein
Browse files Browse the repository at this point in the history
Skip the distance computation when the length difference between the
strings already exceeds the maximum allowed distance.
  • Loading branch information
BUYT-1 committed Jul 11, 2024
1 parent 5b3abd2 commit 1de5068
Showing 1 changed file with 10 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ std::size_t GetLevenshteinBufferSize(auto const& right_string) noexcept {
/* An optimized version of the Levenshtein distance computation algorithm from
* https://en.wikipedia.org/wiki/Levenshtein_distance, using preallocated buffers
*/
unsigned LevenshteinDistance(auto const* l_ptr, auto const* r_ptr, unsigned* v0,
unsigned* v1) noexcept {
unsigned LevenshteinDistance(auto const* l_ptr, auto const* r_ptr, unsigned* v0, unsigned* v1,
std::size_t max_dist, std::size_t bad_value) noexcept {
std::size_t r_size = r_ptr->size();
assert(v0 < v1);
assert(GetLevenshteinBufferSize(*r_ptr) == std::size_t(v1 - v0));
Expand All @@ -43,6 +43,9 @@ unsigned LevenshteinDistance(auto const* l_ptr, auto const* r_ptr, unsigned* v0,
std::swap(l_ptr, r_ptr);
std::swap(l_size, r_size);
}
if (l_size - r_size > max_dist) {
return bad_value;
}

auto const& l = *l_ptr;
auto const& r = *r_ptr;
Expand Down Expand Up @@ -137,7 +140,8 @@ indexes::SimilarityMeasureOutput LevenshteinSimilarityMeasure::MakeIndexes(
upgraded */
utility::MakeUniqueForOverwrite<unsigned[]>(buf_size * 2);

auto get_similarity = [&string_left, left_size, &data_info_right, buf1 = buf.get(),
auto get_similarity = [this, &string_left, left_size, &data_info_right,
buf1 = buf.get(),
buf2 = buf.get() + buf_size](ValueIdentifier value_id_right) {
auto const& right_nulls = data_info_right->GetNulls();
if (right_nulls.find(value_id_right) != right_nulls.end()) return kLowestBound;
Expand All @@ -150,9 +154,9 @@ indexes::SimilarityMeasureOutput LevenshteinSimilarityMeasure::MakeIndexes(
// Left has to be second since that's what the function uses to determine the buffer
// size it needs
Similarity value =
static_cast<Similarity>(max_dist - LevenshteinDistance(&string_right,
&string_left, buf1,
buf2)) /
static_cast<Similarity>(
max_dist - LevenshteinDistance(&string_right, &string_left, buf1,
buf2, max_dist * (1 - min_sim_), max_dist)) /
static_cast<Similarity>(max_dist);
return value;
};
Expand Down

0 comments on commit 1de5068

Please sign in to comment.