-
Notifications
You must be signed in to change notification settings - Fork 66
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
89 changed files
with
4,058 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#pragma once | ||
|
||
#include <cstddef> | ||
#include <string> | ||
|
||
#include "model/index.h" | ||
|
||
namespace model::md { | ||
|
||
struct ColumnMatch { | ||
Index left_col_index; | ||
Index right_col_index; | ||
std::string similarity_function_name; | ||
|
||
ColumnMatch(Index left_col_index, Index right_col_index, | ||
std::string similarity_function_name) noexcept | ||
: left_col_index(left_col_index), | ||
right_col_index(right_col_index), | ||
similarity_function_name(std::move(similarity_function_name)) {} | ||
}; | ||
|
||
} // namespace model::md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#pragma once | ||
|
||
#include <cstddef> | ||
|
||
#include "algorithms/md/decision_boundary.h" | ||
#include "model/index.h" | ||
|
||
namespace model::md { | ||
|
||
class ColumnSimilarityClassifier { | ||
private: | ||
Index column_match_index_; | ||
DecisionBoundary decision_boundary_; | ||
|
||
public: | ||
ColumnSimilarityClassifier(Index column_match_index, | ||
DecisionBoundary decision_boundary) noexcept | ||
: column_match_index_(column_match_index), decision_boundary_(decision_boundary) {} | ||
|
||
[[nodiscard]] Index GetColumnMatchIndex() const noexcept { | ||
return column_match_index_; | ||
} | ||
|
||
[[nodiscard]] DecisionBoundary GetDecisionBoundary() const noexcept { | ||
return decision_boundary_; | ||
} | ||
}; | ||
|
||
} // namespace model::md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#pragma once | ||
|
||
namespace model::md { | ||
// Scalar type used to represent a decision boundary; currently a plain double.
using DecisionBoundary = double; | ||
} // namespace model::md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#pragma once | ||
|
||
#include "algorithms/md/hymd/indexes/column_similarity_info.h" | ||
#include "model/index.h" | ||
|
||
namespace algos::hymd { | ||
|
||
// Bundles the similarity information of one column match with the indices of the
// two columns that form the match.
struct ColumnMatchInfo { | ||
// Similarity data for this column match (see indexes::ColumnMatchSimilarityInfo).
indexes::ColumnMatchSimilarityInfo similarity_info; | ||
// Index of the matched column on the left side.
model::Index left_column_index; | ||
// Index of the matched column on the right side.
model::Index right_column_index; | ||
}; | ||
|
||
} // namespace algos::hymd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#pragma once | ||
|
||
#include <vector> | ||
|
||
#include "algorithms/md/hymd/table_identifiers.h" | ||
|
||
namespace algos::hymd { | ||
// A record represented as a sequence of ValueIdentifier values.
using CompressedRecord = std::vector<ValueIdentifier>; | ||
} // namespace algos::hymd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#pragma once | ||
|
||
#include <vector> | ||
|
||
#include "algorithms/md/decision_boundary.h" | ||
#include "algorithms/md/hymd/utility/vector_double_hash.h" | ||
|
||
namespace algos::hymd { | ||
// A sequence of decision boundaries (model::md::DecisionBoundary values).
using DecisionBoundaryVector = std::vector<model::md::DecisionBoundary>; | ||
} // namespace algos::hymd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
#include "algorithms/md/hymd/hymd.h" | ||
|
||
#include <algorithm> | ||
#include <cstddef> | ||
|
||
#include "algorithms/md/hymd/lattice/cardinality/min_picking_level_getter.h" | ||
#include "algorithms/md/hymd/lattice_traverser.h" | ||
#include "algorithms/md/hymd/preprocessing/similarity_measure/levenshtein_similarity_measure.h" | ||
#include "algorithms/md/hymd/record_pair_inferrer.h" | ||
#include "algorithms/md/hymd/similarity_data.h" | ||
#include "algorithms/md/hymd/utility/md_less.h" | ||
#include "config/names_and_descriptions.h" | ||
#include "config/option_using.h" | ||
#include "model/index.h" | ||
#include "model/table/column.h" | ||
|
||
namespace algos::hymd { | ||
|
||
/// Constructs the algorithm: registers every option first, then makes the left and
/// right table options available.
HyMD::HyMD() : MdAlgorithm({}) {
    RegisterOptions();
    MakeOptionsAvailable({config::names::kLeftTable, config::names::kRightTable});
}
|
||
void HyMD::MakeExecuteOptsAvailable() { | ||
using namespace config::names; | ||
MakeOptionsAvailable({kMinSupport, kPruneNonDisjoint, kColumnMatches}); | ||
} | ||
|
||
void HyMD::RegisterOptions() { | ||
DESBORDANTE_OPTION_USING; | ||
|
||
auto min_support_default = [this]() { | ||
if (compressed_records_->OneTableGiven()) { | ||
return compressed_records_->GetLeftRecords().GetRecords().size() + 1; | ||
} else { | ||
return std::size_t(1); | ||
} | ||
}; | ||
|
||
auto column_matches_default = [this]() { | ||
std::vector<std::tuple<std::string, std::string, std::shared_ptr<SimilarityMeasureCreator>>> | ||
column_matches_option; | ||
if (compressed_records_->OneTableGiven()) { | ||
std::size_t const num_columns = left_schema_->GetNumColumns(); | ||
for (model::Index i = 0; i < num_columns; ++i) { | ||
std::string const column_name = left_schema_->GetColumn(i)->GetName(); | ||
column_matches_option.emplace_back( | ||
column_name, column_name, | ||
std::make_shared<preprocessing::similarity_measure:: | ||
LevenshteinSimilarityMeasure::Creator>(0.7, true, | ||
0)); | ||
} | ||
} else { | ||
std::size_t const num_columns_left = left_schema_->GetNumColumns(); | ||
std::size_t const num_columns_right = left_schema_->GetNumColumns(); | ||
for (model::Index i = 0; i < num_columns_left; ++i) { | ||
std::string const column_name_left = left_schema_->GetColumn(i)->GetName(); | ||
for (model::Index j = 0; j < num_columns_right; ++j) { | ||
std::string const column_name_right = right_schema_->GetColumn(j)->GetName(); | ||
column_matches_option.emplace_back( | ||
column_name_left, column_name_right, | ||
std::make_shared<preprocessing::similarity_measure:: | ||
LevenshteinSimilarityMeasure::Creator>( | ||
0.7, true, 0)); | ||
} | ||
} | ||
} | ||
return column_matches_option; | ||
}; | ||
|
||
RegisterOption(Option{&left_table_, kLeftTable, kDLeftTable}); | ||
RegisterOption(Option{&right_table_, kRightTable, kDRightTable, config::InputTable{nullptr}}); | ||
|
||
RegisterOption(Option{&min_support_, kMinSupport, kDMinSupport, {min_support_default}}); | ||
RegisterOption(Option{&prune_nondisjoint_, kPruneNonDisjoint, kDPruneNonDisjoint, true}); | ||
RegisterOption(Option{ | ||
&column_matches_option_, kColumnMatches, kDColumnMatches, {column_matches_default}}); | ||
} | ||
|
||
// Empty body: this override performs no work.
void HyMD::ResetStateMd() {} | ||
|
||
void HyMD::LoadDataInternal() { | ||
left_schema_ = std::make_shared<RelationalSchema>(left_table_->GetRelationName()); | ||
std::size_t const left_table_cols = left_table_->GetNumberOfColumns(); | ||
for (model::Index i = 0; i < left_table_cols; ++i) { | ||
left_schema_->AppendColumn(left_table_->GetColumnName(i)); | ||
} | ||
if (right_table_ == nullptr) { | ||
right_schema_ = left_schema_; | ||
compressed_records_ = indexes::CompressedRecords::CreateFrom(*left_table_); | ||
} else { | ||
right_schema_ = std::make_unique<RelationalSchema>(right_table_->GetRelationName()); | ||
std::size_t const right_table_cols = right_table_->GetNumberOfColumns(); | ||
for (model::Index i = 0; i < right_table_cols; ++i) { | ||
right_schema_->AppendColumn(right_table_->GetColumnName(i)); | ||
} | ||
compressed_records_ = indexes::CompressedRecords::CreateFrom(*left_table_, *right_table_); | ||
} | ||
if (compressed_records_->GetLeftRecords().GetNumberOfRecords() == 0 || | ||
compressed_records_->GetRightRecords().GetNumberOfRecords() == 0) { | ||
throw config::ConfigurationError("MD mining with either table empty is meaningless!"); | ||
} | ||
} | ||
|
||
unsigned long long HyMD::ExecuteInternal() { | ||
auto const start_time = std::chrono::system_clock::now(); | ||
std::vector<std::tuple<std::unique_ptr<preprocessing::similarity_measure::SimilarityMeasure>, | ||
model::Index, model::Index>> | ||
column_matches_info; | ||
for (auto const& [left_column_name, right_column_name, creator] : column_matches_option_) { | ||
column_matches_info.emplace_back(creator->MakeMeasure(), | ||
left_schema_->GetColumn(left_column_name)->GetIndex(), | ||
right_schema_->GetColumn(right_column_name)->GetIndex()); | ||
} | ||
std::size_t const column_match_number = column_matches_info.size(); | ||
assert(column_match_number != 0); | ||
// TODO: make infrastructure for depth level | ||
SimilarityData similarity_data = | ||
SimilarityData::CreateFrom(compressed_records_.get(), std::move(column_matches_info)); | ||
lattice::FullLattice lattice{column_match_number, [](...) { return 1; }}; | ||
Specializer specializer{similarity_data.GetColumnMatchesInfo(), &lattice, prune_nondisjoint_}; | ||
LatticeTraverser lattice_traverser{ | ||
&lattice, | ||
std::make_unique<lattice::cardinality::MinPickingLevelGetter>(&lattice), | ||
{compressed_records_.get(), similarity_data.GetColumnMatchesInfo(), min_support_, | ||
&lattice}, | ||
&specializer}; | ||
RecordPairInferrer record_pair_inferrer{&similarity_data, &lattice, &specializer}; | ||
|
||
bool done = false; | ||
do { | ||
done = record_pair_inferrer.InferFromRecordPairs(lattice_traverser.TakeRecommendations()); | ||
done = lattice_traverser.TraverseLattice(done); | ||
} while (!done); | ||
|
||
RegisterResults(similarity_data, lattice.GetAll()); | ||
|
||
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - | ||
start_time) | ||
.count(); | ||
} | ||
|
||
/// Converts lattice nodes into model::MD objects and registers them in sorted order.
///
/// @param similarity_data source of the column match count, the column indices of
///        every column match and the "previous" decision bounds of the LHS.
/// @param lattice_mds lattice nodes to convert; every node yields one MD per column
///        match whose RHS bound is not exactly 0.0.
void HyMD::RegisterResults(SimilarityData const& similarity_data,
                           std::vector<lattice::MdLatticeNodeInfo> lattice_mds) {
    std::size_t const column_match_number = similarity_data.GetColumnMatchNumber();

    // Column match descriptions shared by all produced MDs; the similarity function
    // name is taken from the creator configured for the same column match index.
    std::vector<model::md::ColumnMatch> column_matches;
    column_matches.reserve(column_match_number);
    for (model::Index column_match_index = 0; column_match_index < column_match_number;
         ++column_match_index) {
        auto [left_col_index, right_col_index] =
                similarity_data.GetColMatchIndices(column_match_index);
        column_matches.emplace_back(left_col_index, right_col_index,
                                    std::get<2>(column_matches_option_[column_match_index])
                                            ->GetSimilarityMeasureName());
    }

    std::vector<model::MD> mds;
    for (lattice::MdLatticeNodeInfo const& md : lattice_mds) {
        DecisionBoundaryVector const& rhs_bounds = *md.rhs_bounds;
        for (model::Index rhs_index = 0; rhs_index < column_match_number; ++rhs_index) {
            model::md::DecisionBoundary const rhs_bound = rhs_bounds[rhs_index];
            // A bound of exactly 0.0 produces no MD for this column match.
            if (rhs_bound == 0.0) continue;

            // One LHS classifier per column match; the size is known up front.
            std::vector<model::md::LhsColumnSimilarityClassifier> lhs;
            lhs.reserve(column_match_number);
            for (model::Index lhs_index = 0; lhs_index < column_match_number; ++lhs_index) {
                model::md::DecisionBoundary const lhs_bound = md.lhs_bounds[lhs_index];
                lhs.emplace_back(similarity_data.GetPreviousDecisionBound(lhs_bound, lhs_index),
                                 lhs_index, lhs_bound);
            }
            model::md::ColumnSimilarityClassifier rhs{rhs_index, rhs_bound};
            mds.emplace_back(left_schema_.get(), right_schema_.get(), column_matches,
                             std::move(lhs), rhs);
        }
    }

    // Register in the order defined by utility::MdLess.
    std::sort(mds.begin(), mds.end(), utility::MdLess);
    for (model::MD const& md : mds) {
        RegisterMd(md);
    }
}
|
||
} // namespace algos::hymd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#pragma once | ||
|
||
#include <cstddef> | ||
#include <memory> | ||
#include <vector> | ||
|
||
#include "algorithms/md/hymd/indexes/compressed_records.h" | ||
#include "algorithms/md/hymd/lattice/full_lattice.h" | ||
#include "algorithms/md/hymd/preprocessing/similarity_measure/similarity_measure.h" | ||
#include "algorithms/md/hymd/similarity_data.h" | ||
#include "algorithms/md/hymd/similarity_measure_creator.h" | ||
#include "algorithms/md/md_algorithm.h" | ||
#include "config/tabular_data/input_table_type.h" | ||
#include "model/table/relational_schema.h" | ||
|
||
namespace algos::hymd { | ||
|
||
class HyMD final : public MdAlgorithm { | ||
private: | ||
config::InputTable left_table_; | ||
config::InputTable right_table_; | ||
|
||
std::shared_ptr<RelationalSchema> left_schema_; | ||
std::shared_ptr<RelationalSchema> right_schema_; | ||
|
||
std::unique_ptr<indexes::CompressedRecords> compressed_records_; | ||
|
||
std::size_t min_support_ = 0; | ||
bool prune_nondisjoint_ = true; | ||
// TODO: thread number limit | ||
// TODO: different level definitions (cardinality currently used) | ||
// TODO: comparing only some values during similarity calculation | ||
// TODO: cardinality limit | ||
// TODO: automatically calculating minimal support | ||
// TODO: limit LHS bounds searched (currently only size limit is implemented) | ||
// TODO: memory conservation mode (load only some columns) | ||
|
||
std::vector<std::tuple<std::string, std::string, std::shared_ptr<SimilarityMeasureCreator>>> | ||
column_matches_option_; | ||
|
||
void RegisterOptions(); | ||
|
||
void LoadDataInternal() final; | ||
|
||
void MakeExecuteOptsAvailable() final; | ||
void ResetStateMd() final; | ||
unsigned long long ExecuteInternal() final; | ||
|
||
void RegisterResults(SimilarityData const& similarity_data, | ||
std::vector<lattice::MdLatticeNodeInfo> lattice_mds); | ||
|
||
public: | ||
HyMD(); | ||
}; | ||
|
||
} // namespace algos::hymd |
18 changes: 18 additions & 0 deletions
18
src/core/algorithms/md/hymd/indexes/column_similarity_info.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#pragma once | ||
|
||
#include <vector> | ||
|
||
#include "algorithms/md/decision_boundary.h" | ||
#include "algorithms/md/hymd/indexes/similarity_index.h" | ||
#include "algorithms/md/hymd/indexes/similarity_matrix.h" | ||
#include "algorithms/md/hymd/preprocessing/similarity.h" | ||
|
||
namespace algos::hymd::indexes { | ||
// Similarity-related data kept for a single column match.
struct ColumnMatchSimilarityInfo { | ||
// Decision boundaries for this column match.
// NOTE(review): presumably the boundaries usable on the LHS of an MD — confirm.
std::vector<model::md::DecisionBoundary> lhs_bounds; | ||
// NOTE(review): presumably the smallest similarity observed for this column match — confirm.
preprocessing::Similarity lowest_similarity; | ||
// Similarity matrix for this column match (see similarity_matrix.h).
indexes::SimilarityMatrix similarity_matrix; | ||
// Similarity index for this column match (see similarity_index.h).
indexes::SimilarityIndex similarity_index; | ||
// TODO: add slim similarity index | ||
}; | ||
} // namespace algos::hymd::indexes |
Oops, something went wrong.