Skip to content

Commit

Permalink
complex-multimer DBSCAN earlystop with maxClusterNum
Browse files (browse the repository at this point in the history)
  • Loading branch information
Woosub-Kim committed Aug 3, 2024
1 parent 04876ca commit 16dc915
Showing 1 changed file with 31 additions and 31 deletions.
62 changes: 31 additions & 31 deletions src/strucclustutils/scoremultimer.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -181,11 +181,12 @@ class DBSCANCluster {
public:
// Constructor: binds the searchResult and finalClusters members to the given
// arguments and initializes the cluster-size bounds, distance trackers, eps and
// learning rate.
//   searchResult  - supplies qChainKeys, dbChainKeys and alnVec, whose sizes are read here.
//   finalClusters - caller-provided set bound to the member of the same name.
//   minCov        - multiplied by qChainKeys.size() to derive the lower size bound.
// NOTE(review): this span looks like a commit diff rendered without +/- markers,
// so assignments from the old and the new revision appear side by side —
// confirm which lines are actually live before relying on any of them.
DBSCANCluster(SearchResult &searchResult, std::set<cluster_t> &finalClusters, double minCov) : searchResult(searchResult), finalClusters(finalClusters) {
cLabel = 0;
// Presumably the pre-rename spellings of minimumClusterSize / maximumClusterSize
// below (the right-hand sides are identical) — TODO confirm.
clusterSizeThr = std::max(MULTIPLE_CHAINED_COMPLEX, (unsigned int) ((double) searchResult.qChainKeys.size() * minCov));
idealClusterSize = std::min(searchResult.qChainKeys.size(), searchResult.dbChainKeys.size());
// Lower bound: qChainKeys.size() * minCov truncated to unsigned, but never below MULTIPLE_CHAINED_COMPLEX.
minimumClusterSize = std::max(MULTIPLE_CHAINED_COMPLEX, (unsigned int) ((double) searchResult.qChainKeys.size() * minCov));
// Upper bound: the smaller of the query and db chain-key counts.
maximumClusterSize = std::min(searchResult.qChainKeys.size(), searchResult.dbChainKeys.size());
// Integer division of the alignment count by the upper size bound.
// NOTE(review): maximumClusterSize is 0 when either key set is empty, which would
// make this a division by zero — confirm callers rule that case out.
maximumClusterNum = searchResult.alnVec.size() / maximumClusterSize;
prevMaxClusterSize = 0;
// These two assignments are followed by a second maxDist assignment below, which overwrites the first.
maxDist = 0;
eps = DEFAULT_EPS;
// NOTE(review): FLT_MIN is the smallest positive normalized float, not the most
// negative value; as a seed for a running maximum it is only safe if distances
// are never negative — confirm.
maxDist = FLT_MIN;
// FLT_MAX seeds minDist so that any finite distance compared with std::min replaces it.
minDist = FLT_MAX;
learningRate = LEARNING_RATE;
}

Expand All @@ -194,7 +195,7 @@ class DBSCANCluster {
filterAlnsByRBH();
fillDistMap();
// To skip DBSCAN clustering when alignments are few enough.
if (searchResult.alnVec.size() <= idealClusterSize)
if (searchResult.alnVec.size() <= maximumClusterSize)
return checkClusteringNecessity();

return runDBSCAN();
Expand All @@ -204,12 +205,14 @@ class DBSCANCluster {
SearchResult &searchResult;
float eps;
float maxDist;
float minDist;
float learningRate;
unsigned int cLabel;
unsigned int maximumClusterNum;
unsigned int prevMaxClusterSize;
unsigned int maxClusterSize;
unsigned int idealClusterSize;
unsigned int clusterSizeThr;
unsigned int currMaxClusterSize;
unsigned int maximumClusterSize;
unsigned int minimumClusterSize;
std::vector<unsigned int> neighbors;
std::vector<unsigned int> neighborsOfCurrNeighbor;
std::vector<NeighborsWithDist> neighborsWithDist;
Expand Down Expand Up @@ -253,16 +256,16 @@ class DBSCANCluster {
neighbors.emplace_back(neighbor);
}
}
if (neighbors.size() > idealClusterSize || checkChainRedundancy())
if (neighbors.size() > maximumClusterSize || checkChainRedundancy())
getNearestNeighbors(centerAlnIdx);

// too small cluster
if (neighbors.size() < maxClusterSize)
if (neighbors.size() < currMaxClusterSize)
continue;

// new Biggest cluster
if (neighbors.size() > maxClusterSize) {
maxClusterSize = neighbors.size();
if (neighbors.size() >currMaxClusterSize) {
currMaxClusterSize = neighbors.size();
currClusters.clear();
}
SORT_SERIAL(neighbors.begin(), neighbors.end());
Expand All @@ -272,17 +275,20 @@ class DBSCANCluster {
if (!finalClusters.empty() && currClusters.empty())
return finishDBSCAN();

if (maxClusterSize < prevMaxClusterSize)
if (currMaxClusterSize < prevMaxClusterSize)
return finishDBSCAN();

if (maxClusterSize > prevMaxClusterSize) {
if (currMaxClusterSize > prevMaxClusterSize) {
finalClusters.clear();
prevMaxClusterSize = maxClusterSize;
prevMaxClusterSize = currMaxClusterSize;
}

if (maxClusterSize >= clusterSizeThr)
if (currMaxClusterSize >= minimumClusterSize)
finalClusters.insert(currClusters.begin(), currClusters.end());

if (currMaxClusterSize==maximumClusterSize && finalClusters.size() == maximumClusterNum)
return finishDBSCAN();

eps += learningRate;
return runDBSCAN();
}
Expand All @@ -296,9 +302,11 @@ class DBSCANCluster {
ChainToChainAln &currAln = searchResult.alnVec[j];
dist = prevAln.getDistance(currAln);
maxDist = std::max(maxDist, dist);
minDist = std::min(minDist, dist);
distMap.insert({{i,j}, dist});
}
}
eps = minDist;
}

void getNeighbors(size_t centerIdx, std::vector<unsigned int> &neighborVec) {
Expand All @@ -321,7 +329,7 @@ class DBSCANCluster {
aln.label = INITIALIZED_LABEL;
}
cLabel = INITIALIZED_LABEL;
maxClusterSize = 0;
currMaxClusterSize = 0;
currClusters.clear();
}

Expand All @@ -341,7 +349,7 @@ class DBSCANCluster {

bool checkClusteringNecessity() {
// Too few alns => do nothing and finish it
if (searchResult.alnVec.size() < clusterSizeThr)
if (searchResult.alnVec.size() < minimumClusterSize)
return finishDBSCAN();
for (size_t alnIdx=0; alnIdx<searchResult.alnVec.size(); alnIdx++) {
neighbors.emplace_back(alnIdx);
Expand All @@ -367,14 +375,6 @@ class DBSCANCluster {
qFoundChainKeys.clear();
dbFoundChainKeys.clear();
distMap.clear();
// auto it = finalClusters.begin();
// while (it != finalClusters.end()) {
// if (it->size() < clusterSizeThr) {
// it = finalClusters.erase(it);
// continue;
// }
// it++;
// }
return !finalClusters.empty();
}

Expand All @@ -387,17 +387,17 @@ class DBSCANCluster {
qFoundChainKeys.clear();
dbFoundChainKeys.clear();
for (auto qChainKey: searchResult.qChainKeys) {
qBestTmScore.insert({qChainKey, DEF_TM_SCORE});
qBestTmScore.insert({qChainKey, FLT_MIN});
}
for (auto dbChainKey: searchResult.dbChainKeys) {
dbBestTmScore.insert({dbChainKey, DEF_TM_SCORE});
dbBestTmScore.insert({dbChainKey, FLT_MIN});
}
for (auto &aln: searchResult.alnVec) {
qKey = aln.qChain.chainKey;
dbKey = aln.dbChain.chainKey;
tmScore = aln.tmScore;
qBestTmScore[qKey] = qBestTmScore[qKey] < UNINITIALIZED ? tmScore : std::max(tmScore, qBestTmScore[qKey]);
dbBestTmScore[dbKey] = dbBestTmScore[dbKey] < UNINITIALIZED ? tmScore : std::max(tmScore, dbBestTmScore[dbKey]);
qBestTmScore[qKey] = std::max(tmScore, qBestTmScore[qKey]);
dbBestTmScore[dbKey] = std::max(tmScore, dbBestTmScore[dbKey]);
}
size_t alnIdx = 0;
while (alnIdx < searchResult.alnVec.size()) {
Expand All @@ -413,7 +413,7 @@ class DBSCANCluster {
alnIdx ++;
}

if (std::min(qFoundChainKeys.size(), dbFoundChainKeys.size()) < clusterSizeThr)
if (std::min(qFoundChainKeys.size(), dbFoundChainKeys.size()) < minimumClusterSize)
searchResult.alnVec.clear();
}

Expand Down

0 comments on commit 16dc915

Please sign in to comment.