diff --git a/src/strucclustutils/scoremultimer.cpp b/src/strucclustutils/scoremultimer.cpp index ca7c8105..e80c2af8 100644 --- a/src/strucclustutils/scoremultimer.cpp +++ b/src/strucclustutils/scoremultimer.cpp @@ -181,11 +181,12 @@ class DBSCANCluster { public: DBSCANCluster(SearchResult &searchResult, std::set &finalClusters, double minCov) : searchResult(searchResult), finalClusters(finalClusters) { cLabel = 0; - clusterSizeThr = std::max(MULTIPLE_CHAINED_COMPLEX, (unsigned int) ((double) searchResult.qChainKeys.size() * minCov)); - idealClusterSize = std::min(searchResult.qChainKeys.size(), searchResult.dbChainKeys.size()); + minimumClusterSize = std::max(MULTIPLE_CHAINED_COMPLEX, (unsigned int) ((double) searchResult.qChainKeys.size() * minCov)); + maximumClusterSize = std::min(searchResult.qChainKeys.size(), searchResult.dbChainKeys.size()); + maximumClusterNum = searchResult.alnVec.size() / maximumClusterSize; prevMaxClusterSize = 0; - maxDist = 0; - eps = DEFAULT_EPS; + maxDist = FLT_MIN; + minDist = FLT_MAX; learningRate = LEARNING_RATE; } @@ -194,7 +195,7 @@ class DBSCANCluster { filterAlnsByRBH(); fillDistMap(); // To skip DBSCAN clustering when alignments are few enough. - if (searchResult.alnVec.size() <= idealClusterSize) + if (searchResult.alnVec.size() <= maximumClusterSize) return checkClusteringNecessity(); return runDBSCAN(); @@ -204,12 +205,14 @@ class DBSCANCluster { SearchResult &searchResult; float eps; float maxDist; + float minDist; float learningRate; unsigned int cLabel; + unsigned int maximumClusterNum; unsigned int prevMaxClusterSize; - unsigned int maxClusterSize; - unsigned int idealClusterSize; - unsigned int clusterSizeThr; + unsigned int currMaxClusterSize; + unsigned int maximumClusterSize; + unsigned int minimumClusterSize; std::vector neighbors; std::vector neighborsOfCurrNeighbor; std::vector neighborsWithDist; @@ -253,16 +256,16 @@ class DBSCANCluster { neighbors.emplace_back(neighbor); } } - if (neighbors.size() > idealClusterSize || checkChainRedundancy()) + if (neighbors.size() > maximumClusterSize || checkChainRedundancy()) getNearestNeighbors(centerAlnIdx); // too small cluster - if (neighbors.size() < maxClusterSize) + if (neighbors.size() < currMaxClusterSize) continue; // new Biggest cluster - if (neighbors.size() > maxClusterSize) { - maxClusterSize = neighbors.size(); + if (neighbors.size() >currMaxClusterSize) { + currMaxClusterSize = neighbors.size(); currClusters.clear(); } SORT_SERIAL(neighbors.begin(), neighbors.end()); @@ -272,17 +275,20 @@ class DBSCANCluster { if (!finalClusters.empty() && currClusters.empty()) return finishDBSCAN(); - if (maxClusterSize < prevMaxClusterSize) + if (currMaxClusterSize < prevMaxClusterSize) return finishDBSCAN(); - if (maxClusterSize > prevMaxClusterSize) { + if (currMaxClusterSize > prevMaxClusterSize) { finalClusters.clear(); - prevMaxClusterSize = maxClusterSize; + prevMaxClusterSize = currMaxClusterSize; } - if (maxClusterSize >= clusterSizeThr) + if (currMaxClusterSize >= minimumClusterSize) finalClusters.insert(currClusters.begin(), currClusters.end()); + if (currMaxClusterSize==maximumClusterSize && finalClusters.size() == maximumClusterNum) + return finishDBSCAN(); + eps += learningRate; return runDBSCAN(); } @@ -296,9 +302,11 @@ class DBSCANCluster { ChainToChainAln &currAln = searchResult.alnVec[j]; dist = prevAln.getDistance(currAln); maxDist = std::max(maxDist, dist); + minDist = std::min(minDist, dist); distMap.insert({{i,j}, dist}); } } + eps = minDist; } void getNeighbors(size_t centerIdx, std::vector &neighborVec) { @@ -321,7 +329,7 @@ class DBSCANCluster { aln.label = INITIALIZED_LABEL; } cLabel = INITIALIZED_LABEL; - maxClusterSize = 0; + currMaxClusterSize = 0; currClusters.clear(); } @@ -341,7 +349,7 @@ class DBSCANCluster { bool checkClusteringNecessity() { // Too few alns => do nothing and finish it - if (searchResult.alnVec.size() < clusterSizeThr) + if (searchResult.alnVec.size() < minimumClusterSize) return finishDBSCAN(); for (size_t alnIdx=0; alnIdxsize() < clusterSizeThr) { -// it = finalClusters.erase(it); -// continue; -// } -// it++; -// } return !finalClusters.empty(); } @@ -387,17 +387,17 @@ class DBSCANCluster { qFoundChainKeys.clear(); dbFoundChainKeys.clear(); for (auto qChainKey: searchResult.qChainKeys) { - qBestTmScore.insert({qChainKey, DEF_TM_SCORE}); + qBestTmScore.insert({qChainKey, FLT_MIN}); } for (auto dbChainKey: searchResult.dbChainKeys) { - dbBestTmScore.insert({dbChainKey, DEF_TM_SCORE}); + dbBestTmScore.insert({dbChainKey, FLT_MIN}); } for (auto &aln: searchResult.alnVec) { qKey = aln.qChain.chainKey; dbKey = aln.dbChain.chainKey; tmScore = aln.tmScore; - qBestTmScore[qKey] = qBestTmScore[qKey] < UNINITIALIZED ? tmScore : std::max(tmScore, qBestTmScore[qKey]); - dbBestTmScore[dbKey] = dbBestTmScore[dbKey] < UNINITIALIZED ? tmScore : std::max(tmScore, dbBestTmScore[dbKey]); + qBestTmScore[qKey] = std::max(tmScore, qBestTmScore[qKey]); + dbBestTmScore[dbKey] = std::max(tmScore, dbBestTmScore[dbKey]); } size_t alnIdx = 0; while (alnIdx < searchResult.alnVec.size()) { @@ -413,7 +413,7 @@ class DBSCANCluster { alnIdx ++; } - if (std::min(qFoundChainKeys.size(), dbFoundChainKeys.size()) < clusterSizeThr) + if (std::min(qFoundChainKeys.size(), dbFoundChainKeys.size()) < minimumClusterSize) searchResult.alnVec.clear(); }