Skip to content

Commit

Permalink
Implement HyMD
Browse files Browse the repository at this point in the history
  • Loading branch information
BUYT-1 committed Jan 17, 2024
1 parent acd3642 commit f7b4671
Show file tree
Hide file tree
Showing 89 changed files with 4,058 additions and 2 deletions.
7 changes: 5 additions & 2 deletions src/core/algorithms/algorithm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace algos {
using AlgorithmTypes =
std::tuple<Depminer, DFD, FastFDs, FDep, Fd_mine, Pyro, Tane, FUN, hyfd::HyFD, Aid, Apriori,
metric::MetricVerifier, DataStats, fd_verifier::FDVerifier, HyUCC, PyroUCC,
cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier>;
cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, hymd::HyMD>;

// clang-format off
/* Enumeration of all supported non-pipeline algorithms. If you implement a new
Expand Down Expand Up @@ -54,7 +54,10 @@ BETTER_ENUM(AlgorithmType, char,
ac,

/* UCC verifier algorithm */
ucc_verifier
ucc_verifier,

/* MD mining algorithms */
hymd
)
// clang-format on

Expand Down
3 changes: 3 additions & 0 deletions src/core/algorithms/algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@

/* UCC verifier */
#include "ucc/ucc_verifier/ucc_verifier.h"

/* Matching dependencies */
#include "algorithms/md/hymd/hymd.h"
22 changes: 22 additions & 0 deletions src/core/algorithms/md/column_match.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#pragma once

#include <cstddef>
#include <string>

#include "model/index.h"

namespace model::md {

struct ColumnMatch {
Index left_col_index;
Index right_col_index;
std::string similarity_function_name;

ColumnMatch(Index left_col_index, Index right_col_index,
std::string similarity_function_name) noexcept
: left_col_index(left_col_index),
right_col_index(right_col_index),
similarity_function_name(std::move(similarity_function_name)) {}
};

} // namespace model::md
29 changes: 29 additions & 0 deletions src/core/algorithms/md/column_similarity_classifier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include <cstddef>

#include "algorithms/md/decision_boundary.h"
#include "model/index.h"

namespace model::md {

class ColumnSimilarityClassifier {
private:
Index column_match_index_;
DecisionBoundary decision_boundary_;

public:
ColumnSimilarityClassifier(Index column_match_index,
DecisionBoundary decision_boundary) noexcept
: column_match_index_(column_match_index), decision_boundary_(decision_boundary) {}

[[nodiscard]] Index GetColumnMatchIndex() const noexcept {
return column_match_index_;
}

[[nodiscard]] DecisionBoundary GetDecisionBoundary() const noexcept {
return decision_boundary_;
}
};

} // namespace model::md
5 changes: 5 additions & 0 deletions src/core/algorithms/md/decision_boundary.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

namespace model::md {
// Numeric type of a decision boundary; currently a plain double.
using DecisionBoundary = double;
} // namespace model::md
14 changes: 14 additions & 0 deletions src/core/algorithms/md/hymd/column_match_info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#pragma once

#include "algorithms/md/hymd/indexes/column_similarity_info.h"
#include "model/index.h"

namespace algos::hymd {

// Groups the similarity information of a single column match with the indices of the two
// columns that the match relates.
struct ColumnMatchInfo {
// Similarity data (bounds, matrix, index) belonging to this column match.
indexes::ColumnMatchSimilarityInfo similarity_info;
// Index of the matched column on the left side.
// NOTE(review): presumably an index into the left table's schema -- confirm at the construction site.
model::Index left_column_index;
// Index of the matched column on the right side.
// NOTE(review): presumably an index into the right table's schema -- confirm at the construction site.
model::Index right_column_index;
};

} // namespace algos::hymd
9 changes: 9 additions & 0 deletions src/core/algorithms/md/hymd/compressed_record.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include <vector>

#include "algorithms/md/hymd/table_identifiers.h"

namespace algos::hymd {
// A record represented as a sequence of value identifiers.
// NOTE(review): presumably one identifier per column of the record -- confirm against the code
// that builds compressed records.
using CompressedRecord = std::vector<ValueIdentifier>;
} // namespace algos::hymd
10 changes: 10 additions & 0 deletions src/core/algorithms/md/hymd/decision_boundary_vector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

#include <vector>

#include "algorithms/md/decision_boundary.h"
#include "algorithms/md/hymd/utility/vector_double_hash.h"

namespace algos::hymd {
// Sequence of decision boundaries. HyMD::RegisterResults indexes such vectors by column match
// index when reading the LHS and RHS bounds of a lattice node.
using DecisionBoundaryVector = std::vector<model::md::DecisionBoundary>;
} // namespace algos::hymd
180 changes: 180 additions & 0 deletions src/core/algorithms/md/hymd/hymd.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#include "algorithms/md/hymd/hymd.h"

#include <algorithm>
#include <cstddef>

#include "algorithms/md/hymd/lattice/cardinality/min_picking_level_getter.h"
#include "algorithms/md/hymd/lattice_traverser.h"
#include "algorithms/md/hymd/preprocessing/similarity_measure/levenshtein_similarity_measure.h"
#include "algorithms/md/hymd/record_pair_inferrer.h"
#include "algorithms/md/hymd/similarity_data.h"
#include "algorithms/md/hymd/utility/md_less.h"
#include "config/names_and_descriptions.h"
#include "config/option_using.h"
#include "model/index.h"
#include "model/table/column.h"

namespace algos::hymd {

/// Registers every option of the algorithm and then makes the two table options available,
/// so that the input tables can be supplied first.
HyMD::HyMD() : MdAlgorithm({}) {
    using config::names::kLeftTable;
    using config::names::kRightTable;

    RegisterOptions();
    MakeOptionsAvailable({kLeftTable, kRightTable});
}

/// Makes the execution-stage options available: minimum support, non-disjoint pruning and
/// the column matches.
void HyMD::MakeExecuteOptsAvailable() {
    using config::names::kColumnMatches;
    using config::names::kMinSupport;
    using config::names::kPruneNonDisjoint;

    MakeOptionsAvailable({kMinSupport, kPruneNonDisjoint, kColumnMatches});
}

/// Registers all configuration options of HyMD together with their defaults.
///
/// The defaults for minimum support and column matches are computed lazily by lambdas that
/// capture `this`, because they depend on the schemas and compressed records that are only
/// created in LoadDataInternal().
void HyMD::RegisterOptions() {
    DESBORDANTE_OPTION_USING;

    // Default minimum support: number of left records + 1 when a single table is given,
    // otherwise 1.
    // NOTE(review): the "+ 1" presumably excludes MDs supported only by pairs of a record
    // with itself -- confirm.
    auto min_support_default = [this]() {
        if (compressed_records_->OneTableGiven()) {
            return compressed_records_->GetLeftRecords().GetRecords().size() + 1;
        } else {
            return std::size_t(1);
        }
    };

    // Default column matches:
    //  * one table: every column is matched with itself;
    //  * two tables: every left column is matched with every right column.
    auto column_matches_default = [this]() {
        // Every default match uses a Levenshtein measure creator built with the same arguments.
        auto const make_default_creator = []() {
            return std::make_shared<
                    preprocessing::similarity_measure::LevenshteinSimilarityMeasure::Creator>(
                    0.7, true, 0);
        };

        std::vector<std::tuple<std::string, std::string, std::shared_ptr<SimilarityMeasureCreator>>>
                column_matches_option;
        if (compressed_records_->OneTableGiven()) {
            std::size_t const num_columns = left_schema_->GetNumColumns();
            for (model::Index i = 0; i < num_columns; ++i) {
                std::string const column_name = left_schema_->GetColumn(i)->GetName();
                column_matches_option.emplace_back(column_name, column_name,
                                                   make_default_creator());
            }
        } else {
            std::size_t const num_columns_left = left_schema_->GetNumColumns();
            // The right-hand loop must be bounded by the RIGHT schema's column count; using
            // the left schema's count here indexes the right schema out of range (or skips
            // columns) whenever the two tables differ in width.
            std::size_t const num_columns_right = right_schema_->GetNumColumns();
            for (model::Index i = 0; i < num_columns_left; ++i) {
                std::string const column_name_left = left_schema_->GetColumn(i)->GetName();
                for (model::Index j = 0; j < num_columns_right; ++j) {
                    std::string const column_name_right = right_schema_->GetColumn(j)->GetName();
                    column_matches_option.emplace_back(column_name_left, column_name_right,
                                                       make_default_creator());
                }
            }
        }
        return column_matches_option;
    };

    RegisterOption(Option{&left_table_, kLeftTable, kDLeftTable});
    // The right table is optional: a null table means "use the left table on both sides".
    RegisterOption(Option{&right_table_, kRightTable, kDRightTable, config::InputTable{nullptr}});

    RegisterOption(Option{&min_support_, kMinSupport, kDMinSupport, {min_support_default}});
    RegisterOption(Option{&prune_nondisjoint_, kPruneNonDisjoint, kDPruneNonDisjoint, true});
    RegisterOption(Option{
            &column_matches_option_, kColumnMatches, kDColumnMatches, {column_matches_default}});
}

// HyMD performs no work in this hook: the body is empty.
void HyMD::ResetStateMd() {}

/// Builds the relational schemas and the compressed records from the input tables.
///
/// When no right table is supplied, the left schema is shared for both sides and the
/// compressed records are created from the left table alone.
///
/// @throws config::ConfigurationError if either side ends up with zero records.
void HyMD::LoadDataInternal() {
    // Creates a schema named after the table's relation and appends the table's columns in
    // order. Used for both sides so that they are constructed the same way.
    auto const make_schema = [](auto const& table) {
        auto schema = std::make_shared<RelationalSchema>(table->GetRelationName());
        std::size_t const column_count = table->GetNumberOfColumns();
        for (model::Index i = 0; i < column_count; ++i) {
            schema->AppendColumn(table->GetColumnName(i));
        }
        return schema;
    };

    left_schema_ = make_schema(left_table_);
    if (right_table_ == nullptr) {
        // Single-table mode: both sides refer to the same schema object.
        right_schema_ = left_schema_;
        compressed_records_ = indexes::CompressedRecords::CreateFrom(*left_table_);
    } else {
        right_schema_ = make_schema(right_table_);
        compressed_records_ = indexes::CompressedRecords::CreateFrom(*left_table_, *right_table_);
    }

    if (compressed_records_->GetLeftRecords().GetNumberOfRecords() == 0 ||
        compressed_records_->GetRightRecords().GetNumberOfRecords() == 0) {
        throw config::ConfigurationError("MD mining with either table empty is meaningless!");
    }
}

/// Runs the MD discovery and registers the resulting dependencies.
///
/// Creates a similarity measure for every configured column match, builds the similarity
/// data and the lattice, then alternates between inference from record pairs and lattice
/// traversal until the traversal reports completion.
///
/// @return elapsed execution time in milliseconds.
unsigned long long HyMD::ExecuteInternal() {
    // steady_clock is monotonic, so the measured interval cannot be distorted (or become
    // negative) by wall-clock adjustments during the run.
    auto const start_time = std::chrono::steady_clock::now();

    // Resolve the configured column names to column indices and instantiate the measures.
    std::vector<std::tuple<std::unique_ptr<preprocessing::similarity_measure::SimilarityMeasure>,
                           model::Index, model::Index>>
            column_matches_info;
    for (auto const& [left_column_name, right_column_name, creator] : column_matches_option_) {
        column_matches_info.emplace_back(creator->MakeMeasure(),
                                         left_schema_->GetColumn(left_column_name)->GetIndex(),
                                         right_schema_->GetColumn(right_column_name)->GetIndex());
    }
    std::size_t const column_match_number = column_matches_info.size();
    // Debug-only sanity check: at least one column match is expected.
    assert(column_match_number != 0);
    // TODO: make infrastructure for depth level
    SimilarityData similarity_data =
            SimilarityData::CreateFrom(compressed_records_.get(), std::move(column_matches_info));
    lattice::FullLattice lattice{column_match_number, [](...) { return 1; }};
    Specializer specializer{similarity_data.GetColumnMatchesInfo(), &lattice, prune_nondisjoint_};
    LatticeTraverser lattice_traverser{
            &lattice,
            std::make_unique<lattice::cardinality::MinPickingLevelGetter>(&lattice),
            {compressed_records_.get(), similarity_data.GetColumnMatchesInfo(), min_support_,
             &lattice},
            &specializer};
    RecordPairInferrer record_pair_inferrer{&similarity_data, &lattice, &specializer};

    // Each round feeds the traverser's recommendations to the inferrer, then passes the
    // inferrer's result on to the traverser; the traverser's result decides termination.
    bool done = false;
    do {
        done = record_pair_inferrer.InferFromRecordPairs(lattice_traverser.TakeRecommendations());
        done = lattice_traverser.TraverseLattice(done);
    } while (!done);

    RegisterResults(similarity_data, lattice.GetAll());

    auto const elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - start_time);
    return elapsed.count();
}

/// Converts the lattice nodes into model::MD objects and registers them in sorted order.
///
/// For every node, one MD is produced per column match whose RHS bound is not 0.0; each MD
/// carries the full list of LHS classifiers of the node and a copy of the column match
/// descriptions.
void HyMD::RegisterResults(SimilarityData const& similarity_data,
                           std::vector<lattice::MdLatticeNodeInfo> lattice_mds) {
    std::size_t const match_count = similarity_data.GetColumnMatchNumber();

    // Describe every column match: its two column indices and the name of its measure.
    std::vector<model::md::ColumnMatch> column_matches;
    column_matches.reserve(match_count);
    for (model::Index match_index = 0; match_index != match_count; ++match_index) {
        auto [left_index, right_index] = similarity_data.GetColMatchIndices(match_index);
        auto const& measure_creator = std::get<2>(column_matches_option_[match_index]);
        column_matches.emplace_back(left_index, right_index,
                                    measure_creator->GetSimilarityMeasureName());
    }

    std::vector<model::MD> mds;
    for (lattice::MdLatticeNodeInfo const& node : lattice_mds) {
        DecisionBoundaryVector const& rhs_bounds = *node.rhs_bounds;
        for (model::Index rhs_index = 0; rhs_index != match_count; ++rhs_index) {
            model::md::DecisionBoundary const rhs_bound = rhs_bounds[rhs_index];
            // A bound of exactly 0.0 produces no MD for this column match.
            if (rhs_bound != 0.0) {
                std::vector<model::md::LhsColumnSimilarityClassifier> lhs_classifiers;
                for (model::Index lhs_index = 0; lhs_index != match_count; ++lhs_index) {
                    model::md::DecisionBoundary const lhs_bound = node.lhs_bounds[lhs_index];
                    lhs_classifiers.emplace_back(
                            similarity_data.GetPreviousDecisionBound(lhs_bound, lhs_index),
                            lhs_index, lhs_bound);
                }
                model::md::ColumnSimilarityClassifier rhs_classifier{rhs_index, rhs_bound};
                mds.emplace_back(left_schema_.get(), right_schema_.get(), column_matches,
                                 std::move(lhs_classifiers), rhs_classifier);
            }
        }
    }

    // Register in the order defined by utility::MdLess.
    std::sort(mds.begin(), mds.end(), utility::MdLess);
    for (model::MD const& sorted_md : mds) {
        RegisterMd(sorted_md);
    }
}

} // namespace algos::hymd
56 changes: 56 additions & 0 deletions src/core/algorithms/md/hymd/hymd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#pragma once

#include <cstddef>
#include <memory>
#include <vector>

#include "algorithms/md/hymd/indexes/compressed_records.h"
#include "algorithms/md/hymd/lattice/full_lattice.h"
#include "algorithms/md/hymd/preprocessing/similarity_measure/similarity_measure.h"
#include "algorithms/md/hymd/similarity_data.h"
#include "algorithms/md/hymd/similarity_measure_creator.h"
#include "algorithms/md/md_algorithm.h"
#include "config/tabular_data/input_table_type.h"
#include "model/table/relational_schema.h"

namespace algos::hymd {

// HyMD matching dependency mining algorithm, implemented on top of MdAlgorithm.
// Options are registered in the constructor; data is loaded in LoadDataInternal() and the
// search itself runs in ExecuteInternal().
class HyMD final : public MdAlgorithm {
private:
// Input tables. The right table option defaults to a null table, in which case the left
// table is used for both sides.
config::InputTable left_table_;
config::InputTable right_table_;

// Schemas built from the input tables; both point to the same object when only one table
// is given.
std::shared_ptr<RelationalSchema> left_schema_;
std::shared_ptr<RelationalSchema> right_schema_;

// Compressed form of the records of the input table(s), created in LoadDataInternal().
std::unique_ptr<indexes::CompressedRecords> compressed_records_;

// Minimum support option; its default is computed from the loaded records.
std::size_t min_support_ = 0;
// Option passed to the Specializer during execution.
bool prune_nondisjoint_ = true;
// TODO: thread number limit
// TODO: different level definitions (cardinality currently used)
// TODO: comparing only some values during similarity calculation
// TODO: cardinality limit
// TODO: automatically calculating minimal support
// TODO: limit LHS bounds searched (currently only size limit is implemented)
// TODO: memory conservation mode (load only some columns)

// Column matches option: (left column name, right column name, similarity measure creator).
std::vector<std::tuple<std::string, std::string, std::shared_ptr<SimilarityMeasureCreator>>>
column_matches_option_;

// Registers all options and their defaults.
void RegisterOptions();

// Builds the schemas and the compressed records from the input tables.
void LoadDataInternal() final;

// Makes the execution-stage options available.
void MakeExecuteOptsAvailable() final;
// Empty for HyMD.
void ResetStateMd() final;
// Runs the search and returns the elapsed time in milliseconds.
unsigned long long ExecuteInternal() final;

// Converts lattice nodes into MDs and registers them.
void RegisterResults(SimilarityData const& similarity_data,
std::vector<lattice::MdLatticeNodeInfo> lattice_mds);

public:
// Registers the options and makes the table options available.
HyMD();
};

} // namespace algos::hymd
18 changes: 18 additions & 0 deletions src/core/algorithms/md/hymd/indexes/column_similarity_info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include <vector>

#include "algorithms/md/decision_boundary.h"
#include "algorithms/md/hymd/indexes/similarity_index.h"
#include "algorithms/md/hymd/indexes/similarity_matrix.h"
#include "algorithms/md/hymd/preprocessing/similarity.h"

namespace algos::hymd::indexes {
// Similarity data kept for a single column match.
struct ColumnMatchSimilarityInfo {
// Decision boundaries considered for the LHS of this column match.
// NOTE(review): presumably sorted -- confirm against the code that fills this vector.
std::vector<model::md::DecisionBoundary> lhs_bounds;
// NOTE(review): presumably the lowest similarity observed between values of the two
// columns -- confirm against the code that computes it.
preprocessing::Similarity lowest_similarity;
// Similarity matrix for this column match (see similarity_matrix.h for its layout).
indexes::SimilarityMatrix similarity_matrix;
// Similarity index for this column match (see similarity_index.h for its layout).
indexes::SimilarityIndex similarity_index;
// TODO: add slim similarity index
};
} // namespace algos::hymd::indexes
Loading

0 comments on commit f7b4671

Please sign in to comment.