From bb7ec93bfac9de668d9e7f902be8b1020efa0486 Mon Sep 17 00:00:00 2001 From: Martin Steinegger Date: Tue, 6 Feb 2024 16:48:35 +0900 Subject: [PATCH 001/160] First version for complex filter --- src/FoldseekBase.cpp | 12 +++ src/LocalCommandDeclarations.h | 1 + src/commons/LocalParameters.cpp | 6 ++ src/commons/LocalParameters.h | 1 + src/strucclustutils/CMakeLists.txt | 1 + src/strucclustutils/filtercomplex.cpp | 135 ++++++++++++++++++++++++++ 6 files changed, 156 insertions(+) create mode 100644 src/strucclustutils/filtercomplex.cpp diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 8922f129..c8202e54 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -263,6 +263,18 @@ std::vector foldseekCommands = { {"complexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb} } }, + {"filtercomplex", filtercomplex, &localPar.filtercomplex, COMMAND_HIDDEN, + "Filters complexes", + "foldseek filtercomplex queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", + "Seongeun Kim & Sooyoung Cha ", + " ", + CITATION_FOLDSEEK, { + {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, + {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb } + } + }, {"complexsearch", complexsearch, &localPar.complexsearchworkflow, COMMAND_MAIN, "Complex level search", "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 0787a154..60f2251d 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -21,6 +21,7 @@ extern int structureungappedalign(int argc, const char** argv, const Command &co extern int convert2pdb(int argc, const char** argv, const Command &command); extern int compressca(int argc, const char** argv, const Command &command); extern int scorecomplex(int argc, const char **argv, const Command& command); +extern int filtercomplex(int argc, const char **argv, const Command& command); extern int easycomplexsearch(int argc, const char **argv, const Command &command); extern int createcomplexreport(int argc, const char **argv, const Command &command); extern int expandcomplex(int argc, const char **argv, const Command &command); diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 6ec6c839..420b14d8 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -174,6 +174,12 @@ LocalParameters::LocalParameters() : scorecomplex.push_back(&PARAM_V); scorecomplex.push_back(&PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD); + //filtercomplex + filtercomplex.push_back(&PARAM_V); + filtercomplex.push_back(&PARAM_THREADS); + filtercomplex.push_back(&PARAM_C); + filtercomplex.push_back(&PARAM_COV_MODE); + // createcomplexreport createcomplexreport.push_back(&PARAM_DB_OUTPUT); createcomplexreport.push_back(&PARAM_THREADS); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index ce2705be..ae0f50a1 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -89,6 +89,7 @@ class LocalParameters : public Parameters { std::vector structurecreatedb; std::vector compressca; std::vector scorecomplex; + std::vector filtercomplex; std::vector complexsearchworkflow; std::vector easyscomplexsearchworkflow; std::vector createcomplexreport; diff --git a/src/strucclustutils/CMakeLists.txt b/src/strucclustutils/CMakeLists.txt index fe3955ae..992f91b7 100644 --- a/src/strucclustutils/CMakeLists.txt +++ b/src/strucclustutils/CMakeLists.txt @@ -14,6 +14,7 @@ set(strucclustutils_source_files strucclustutils/convert2pdb.cpp strucclustutils/compressca.cpp strucclustutils/scorecomplex.cpp + strucclustutils/filtercomplex.cpp strucclustutils/createcomplexreport.cpp strucclustutils/createcomplexreport.h strucclustutils/expandcomplex.cpp diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp new file mode 100644 index 00000000..716a2f27 --- /dev/null +++ b/src/strucclustutils/filtercomplex.cpp @@ -0,0 +1,135 @@ +// +// Created by Martin Steinegger on 2/6/24. +// +#include "Util.h" +#include "LocalParameters.h" +#include "Matcher.h" +#include "Debug.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "IndexReader.h" +#include "FileUtil.h" +#include "TranslateNucl.h" +#include "MemoryMapped.h" +#include "createcomplexreport.h" +#include "LDDT.h" +#include "CalcProbTP.h" +#include +#ifdef OPENMP +#include +#endif + +int filtercomplex(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; + const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + int dbaccessMode = (DBReader::USE_INDEX); + std::map qKeyToSet; + std::map tKeyToSet; + IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + IndexReader qDbrHeader(par.db1, par.threads, IndexReader::SRC_HEADERS , (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); + IndexReader *tDbrHeader; + if (sameDB) { + tDbrHeader = &qDbrHeader; + } else { + tDbrHeader = new IndexReader(par.db2, par.threads, IndexReader::SRC_HEADERS, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); + } + + DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + alnDbr.open(DBReader::LINEAR_ACCCESS); + + size_t localThreads = 1; +#ifdef OPENMP + localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); +#endif + + const bool shouldCompress = par.dbOut == true && par.compressed == true; + const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE; + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, dbType); + resultWriter.open(); + const bool isDb = par.dbOut; + std::string qLookupFile = par.db1 + ".lookup"; + std::string tLookupFile = par.db2 + ".lookup"; + + TranslateNucl translateNucl(static_cast(par.translationTable)); + + Matcher::result_t res; + std::map qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; + std::map> qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; + std::vector qComplexIdVec, tComplexIdVec; + getKeyToIdMapIdToKeysMapIdVec(qLookupFile, qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); + getKeyToIdMapIdToKeysMapIdVec(tLookupFile, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); + + qChainKeyToComplexIdMap.clear(); + Debug::Progress progress(qComplexIdVec.size()); + + std::vector complexResults; +#pragma omp parallel num_threads(localThreads) + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); +#endif + Matcher::result_t res; + std::vector localComplexResults; +#pragma omp for schedule(dynamic, 10) nowait + for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { + progress.updateProgress(); + std::vector assIdVec; +// std::vector compAlns; + unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; + std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; + for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { + unsigned int qChainKey = qChainKeys[qChainIdx]; + unsigned int qChainDbKey = alnDbr.getId(qChainKey); + if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY) { + continue; + } + +// getComplexNameChainName(queryChainName, qCompAndChainName); + char *data = alnDbr.getData(qChainDbKey, thread_idx); + while (*data != '\0') { + ComplexDataHandler retComplex = parseScoreComplexResult(data, res); + if (!retComplex.isValid){ + Debug(Debug::ERROR) << "No scorecomplex result provided"; + EXIT(EXIT_FAILURE); + } + data = Util::skipLine(data); + unsigned int assId = retComplex.assId; + unsigned int compAlnIdx = std::find(assIdVec.begin(), assIdVec.end(), assId) - assIdVec.begin(); +// if (compAlnIdx == compAlns.size()) { +// assIdVec.emplace_back(assId); +// compAlns.emplace_back(queryChainName, targetChainName, retComplex.qTmScore, retComplex.tTmScore, retComplex.uString, retComplex.tString, assId); +// } else { +// compAlns[compAlnIdx].qChainNames.emplace_back(queryChainName); +// compAlns[compAlnIdx].tChainNames.emplace_back(targetChainName); +// } + } // while end + } + // CHECK CRIETERIA HERE + + // WRITE RESULT here + std::string result;\ + // WRITE TARGET COMPLEX IDS FOR COMPLEX THAT FULLFIL THE CRITERIA + result.push_back("1\n"); + resultWriter.writeData(result.c_str(), result.length(), qComplexId, 0, isDb, isDb); + +// for (size_t compAlnIdx = 0; compAlnIdx < compAlns.size(); compAlnIdx++) { +// const ComplexAlignment &aln = compAlns[compAlnIdx]; +// getScoreComplexResults(localComplexResults, aln.qChainNames, aln.tChainNames, aln.qTMScore, aln.tTMScore, aln.u, aln.t, aln.assId); +// } + } // for end + + } // MP end + + resultWriter.close(true); + if (isDb == false) { + FileUtil::remove(par.db4Index.c_str()); + } + alnDbr.close(); + if (sameDB == false) { + delete tDbrHeader; + } + return EXIT_SUCCESS; +} \ No newline at end of file From 03860d19d818f8ed317eb101316ffea447e10ecd Mon Sep 17 00:00:00 2001 From: sooyoung Date: Thu, 15 Feb 2024 15:07:12 +0900 Subject: [PATCH 002/160] has error, but for sharing status. Coverge criteria --- src/strucclustutils/filtercomplex.cpp | 118 ++++++++++++++++---------- 1 file changed, 75 insertions(+), 43 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 716a2f27..e1501c69 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -1,12 +1,9 @@ -// -// Created by Martin Steinegger on 2/6/24. -// +#include "DBWriter.h" #include "Util.h" #include "LocalParameters.h" #include "Matcher.h" #include "Debug.h" #include "DBReader.h" -#include "DBWriter.h" #include "IndexReader.h" #include "FileUtil.h" #include "TranslateNucl.h" @@ -15,10 +12,39 @@ #include "LDDT.h" #include "CalcProbTP.h" #include + #ifdef OPENMP #include #endif - +/* +bool checkFilterCriteria(Matcher::result_t &ressum, double seqIdThr, int alnLenThr, int covMode, float covThr) { + const bool seqIdOK = (ressum.seqId >= seqIdThr); + const bool covOK = Util::hasCoverage(covThr, covMode, ressum.qcov, ressum.dbcov); + const bool alnLenOK = Util::hasAlignmentLength(alnLenThr, ressum.alnLength); + //const bool tmOK = (re.) + if + // general accaptance criteria + ( + seqIdOK && + covOK && + alnLenOK + ) { + return true; + } else { + return false; + } +} +*/ +bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr) { + const bool covOK = Util::hasCoverage(covThr, covMode, qcov, dbcov); + if ( + covOK + ) { + return true; + } else { + return false; + } +} int filtercomplex(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.parseParameters(argc, argv, command, true, 0, 0); @@ -28,22 +54,13 @@ int filtercomplex(int argc, const char **argv, const Command &command) { std::map qKeyToSet; std::map tKeyToSet; IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); - IndexReader qDbrHeader(par.db1, par.threads, IndexReader::SRC_HEADERS , (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); - IndexReader *tDbrHeader; - if (sameDB) { - tDbrHeader = &qDbrHeader; - } else { - tDbrHeader = new IndexReader(par.db2, par.threads, IndexReader::SRC_HEADERS, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); - } - + IndexReader tDbr(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); alnDbr.open(DBReader::LINEAR_ACCCESS); - size_t localThreads = 1; #ifdef OPENMP localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); #endif - const bool shouldCompress = par.dbOut == true && par.compressed == true; const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE; DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, dbType); @@ -51,19 +68,15 @@ int filtercomplex(int argc, const char **argv, const Command &command) { const bool isDb = par.dbOut; std::string qLookupFile = par.db1 + ".lookup"; std::string tLookupFile = par.db2 + ".lookup"; - TranslateNucl translateNucl(static_cast(par.translationTable)); - Matcher::result_t res; std::map qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; std::map> qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; std::vector qComplexIdVec, tComplexIdVec; getKeyToIdMapIdToKeysMapIdVec(qLookupFile, qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); getKeyToIdMapIdToKeysMapIdVec(tLookupFile, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); - qChainKeyToComplexIdMap.clear(); Debug::Progress progress(qComplexIdVec.size()); - std::vector complexResults; #pragma omp parallel num_threads(localThreads) { @@ -74,20 +87,43 @@ int filtercomplex(int argc, const char **argv, const Command &command) { Matcher::result_t res; std::vector localComplexResults; #pragma omp for schedule(dynamic, 10) nowait + std::map tComplexLength; + for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { + unsigned int tComplexId = tComplexIdVec[tComplexIdx]; + std::vector &tChainKeys = tComplexIdToChainKeyMap[tComplexId]; + unsigned int tSeqLen = 0; + for (size_t tChainIdx = 0; tChainIdx < tChainKeys.size(); tChainIdx++ ) { + unsigned int tChainKey = tChainKeys[tChainIdx]; + unsigned int tChainDbKey = alnDbr.getId(tChainKey); + if (tChainDbKey == NOT_AVAILABLE_CHAIN_KEY) { + continue; + } + const char *entry[255]; + size_t columns = Util::getWordsOfLine(tDbr.sequenceReader->getDataByDBKey(tChainKey, thread_idx), entry, 255); + unsigned int curSeqLen = Util::fast_atoi(entry[2]); + tSeqLen += curSeqLen; + } + tComplexLength[tComplexId] = tSeqLen; + } + tComplexIdToChainKeyMap.clear() for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { progress.updateProgress(); std::vector assIdVec; -// std::vector compAlns; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; + std::map covSum; + unsigned int qSeqLen = 0; + std::map assIdTodbKey; for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainDbKey = alnDbr.getId(qChainKey); if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY) { continue; } - -// getComplexNameChainName(queryChainName, qCompAndChainName); + const char *entry[255]; + size_t columns = Util::getWordsOfLine(qDbr.sequenceReader->getDataByDBKey(qChainKey, thread_idx), entry, 255); + unsigned int curSeqLen = Util::fast_atoi(entry[2]); + qSeqLen += curSeqLen; char *data = alnDbr.getData(qChainDbKey, thread_idx); while (*data != '\0') { ComplexDataHandler retComplex = parseScoreComplexResult(data, res); @@ -97,39 +133,35 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } data = Util::skipLine(data); unsigned int assId = retComplex.assId; - unsigned int compAlnIdx = std::find(assIdVec.begin(), assIdVec.end(), assId) - assIdVec.begin(); -// if (compAlnIdx == compAlns.size()) { -// assIdVec.emplace_back(assId); -// compAlns.emplace_back(queryChainName, targetChainName, retComplex.qTmScore, retComplex.tTmScore, retComplex.uString, retComplex.tString, assId); -// } else { -// compAlns[compAlnIdx].qChainNames.emplace_back(queryChainName); -// compAlns[compAlnIdx].tChainNames.emplace_back(targetChainName); -// } +// unsigned int compAlnIdx = std::find(assIdVec.begin(), assIdVec.end(), assId) - assIdVec.begin(); + if (covSum.find(assId) == covSum.end()) { + covSum[assId] = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + assIdTodbKey.emplace(assId, res.dbKey); + } + else{ + covSum[assId] += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + } } // while end } + // assID : alnLen + std::string result; + for (const auto& pair : covSum){ + float qcov = static_cast(pair.second) / static_cast(qSeqLen); + float dbcov = static_cast(pair.second) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); + if (checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ + result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]])+ "\n" ; + } + } // CHECK CRIETERIA HERE - // WRITE RESULT here - std::string result;\ // WRITE TARGET COMPLEX IDS FOR COMPLEX THAT FULLFIL THE CRITERIA - result.push_back("1\n"); resultWriter.writeData(result.c_str(), result.length(), qComplexId, 0, isDb, isDb); - -// for (size_t compAlnIdx = 0; compAlnIdx < compAlns.size(); compAlnIdx++) { -// const ComplexAlignment &aln = compAlns[compAlnIdx]; -// getScoreComplexResults(localComplexResults, aln.qChainNames, aln.tChainNames, aln.qTMScore, aln.tTMScore, aln.u, aln.t, aln.assId); -// } } // for end - } // MP end - resultWriter.close(true); if (isDb == false) { FileUtil::remove(par.db4Index.c_str()); } alnDbr.close(); - if (sameDB == false) { - delete tDbrHeader; - } return EXIT_SUCCESS; } \ No newline at end of file From 5adeb999339dddf70db26e9c6cb8cfc98b218ee5 Mon Sep 17 00:00:00 2001 From: sooyoung Date: Thu, 15 Feb 2024 16:01:21 +0900 Subject: [PATCH 003/160] no errors, not debugged yet --- src/strucclustutils/filtercomplex.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index e1501c69..6922f1a4 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -86,8 +86,8 @@ int filtercomplex(int argc, const char **argv, const Command &command) { #endif Matcher::result_t res; std::vector localComplexResults; -#pragma omp for schedule(dynamic, 10) nowait std::map tComplexLength; +#pragma omp for schedule(dynamic, 10) nowait for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { unsigned int tComplexId = tComplexIdVec[tComplexIdx]; std::vector &tChainKeys = tComplexIdToChainKeyMap[tComplexId]; @@ -103,9 +103,10 @@ int filtercomplex(int argc, const char **argv, const Command &command) { unsigned int curSeqLen = Util::fast_atoi(entry[2]); tSeqLen += curSeqLen; } - tComplexLength[tComplexId] = tSeqLen; + tComplexLength.emplace(tComplexId, tSeqLen); } - tComplexIdToChainKeyMap.clear() + tComplexIdToChainKeyMap.clear(); + for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { progress.updateProgress(); std::vector assIdVec; From d81811f9640dfaaa36c8f0477e0f7c46ab6a6dda Mon Sep 17 00:00:00 2001 From: sooyoung Date: Fri, 16 Feb 2024 19:00:18 +0900 Subject: [PATCH 004/160] TODO: select highest aligned alignments among same complex-complex & what if user wants to use -c 0.0? --- src/strucclustutils/filtercomplex.cpp | 92 +++++++++++++++------------ 1 file changed, 53 insertions(+), 39 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 6922f1a4..0b119382 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -37,6 +37,7 @@ bool checkFilterCriteria(Matcher::result_t &ressum, double seqIdThr, int alnLenT */ bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr) { const bool covOK = Util::hasCoverage(covThr, covMode, qcov, dbcov); + std::cout< &qChainKeys) { + unsigned int qResidueLen = 0; + for (auto qChainKey: qChainKeys) { + size_t id = qDbr.sequenceReader->getId(qChainKey); + // Not accessible + if (id == NOT_AVAILABLE_CHAIN_KEY) + return 0; + qResidueLen += qDbr.sequenceReader->getSeqLen(id); + } + return qResidueLen; +} + int filtercomplex(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.parseParameters(argc, argv, command, true, 0, 0); + if (par.covThr == false){ + par.covThr = 0.8; + } const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); int dbaccessMode = (DBReader::USE_INDEX); std::map qKeyToSet; std::map tKeyToSet; + IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); IndexReader tDbr(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); alnDbr.open(DBReader::LINEAR_ACCCESS); size_t localThreads = 1; + #ifdef OPENMP - localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); + //localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); #endif const bool shouldCompress = par.dbOut == true && par.compressed == true; const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE; @@ -78,55 +96,50 @@ int filtercomplex(int argc, const char **argv, const Command &command) { qChainKeyToComplexIdMap.clear(); Debug::Progress progress(qComplexIdVec.size()); std::vector complexResults; + std::map tComplexLength; + std::map qComplexLength; + #pragma omp parallel num_threads(localThreads) { unsigned int thread_idx = 0; #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif - Matcher::result_t res; - std::vector localComplexResults; - std::map tComplexLength; + Matcher::result_t res; + std::vector localComplexResults; #pragma omp for schedule(dynamic, 10) nowait for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { unsigned int tComplexId = tComplexIdVec[tComplexIdx]; std::vector &tChainKeys = tComplexIdToChainKeyMap[tComplexId]; - unsigned int tSeqLen = 0; - for (size_t tChainIdx = 0; tChainIdx < tChainKeys.size(); tChainIdx++ ) { - unsigned int tChainKey = tChainKeys[tChainIdx]; - unsigned int tChainDbKey = alnDbr.getId(tChainKey); - if (tChainDbKey == NOT_AVAILABLE_CHAIN_KEY) { - continue; - } - const char *entry[255]; - size_t columns = Util::getWordsOfLine(tDbr.sequenceReader->getDataByDBKey(tChainKey, thread_idx), entry, 255); - unsigned int curSeqLen = Util::fast_atoi(entry[2]); - tSeqLen += curSeqLen; + if (tChainKeys.empty()) { + continue; } - tComplexLength.emplace(tComplexId, tSeqLen); + unsigned int reslen = getQueryResidueLength(tDbr, tChainKeys); + tComplexLength[tComplexId] =reslen; } - tComplexIdToChainKeyMap.clear(); - - for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { - progress.updateProgress(); - std::vector assIdVec; - unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; + for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { + unsigned int qComplexId = qComplexIdVec[qComplexIdx]; std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; + if (qChainKeys.empty()) { + continue; + } + unsigned int reslen = getQueryResidueLength(qDbr, qChainKeys); + qComplexLength[qComplexId] = reslen; + } + for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { + //progress.updateProgress(); std::map covSum; - unsigned int qSeqLen = 0; + unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; std::map assIdTodbKey; + std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainDbKey = alnDbr.getId(qChainKey); if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY) { continue; } - const char *entry[255]; - size_t columns = Util::getWordsOfLine(qDbr.sequenceReader->getDataByDBKey(qChainKey, thread_idx), entry, 255); - unsigned int curSeqLen = Util::fast_atoi(entry[2]); - qSeqLen += curSeqLen; char *data = alnDbr.getData(qChainDbKey, thread_idx); - while (*data != '\0') { + while (*data) { ComplexDataHandler retComplex = parseScoreComplexResult(data, res); if (!retComplex.isValid){ Debug(Debug::ERROR) << "No scorecomplex result provided"; @@ -134,35 +147,36 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } data = Util::skipLine(data); unsigned int assId = retComplex.assId; -// unsigned int compAlnIdx = std::find(assIdVec.begin(), assIdVec.end(), assId) - assIdVec.begin(); if (covSum.find(assId) == covSum.end()) { covSum[assId] = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); assIdTodbKey.emplace(assId, res.dbKey); } else{ - covSum[assId] += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + covSum[assId] += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); } - } // while end + } } - // assID : alnLen std::string result; for (const auto& pair : covSum){ - float qcov = static_cast(pair.second) / static_cast(qSeqLen); + float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); float dbcov = static_cast(pair.second) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); if (checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ - result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]])+ "\n" ; + //result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(qcov)+ "\t" + std::to_string(dbcov)+ "\n" ; + result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\n" ; } } - // CHECK CRIETERIA HERE - // WRITE RESULT here - // WRITE TARGET COMPLEX IDS FOR COMPLEX THAT FULLFIL THE CRITERIA resultWriter.writeData(result.c_str(), result.length(), qComplexId, 0, isDb, isDb); - } // for end - } // MP end + } + } resultWriter.close(true); if (isDb == false) { FileUtil::remove(par.db4Index.c_str()); } alnDbr.close(); + /* + if (alnDbr != NULL) { + delete alnDbr; + } + */ return EXIT_SUCCESS; } \ No newline at end of file From b7bc37fcf5835966d3237d75a0266f69695ead72 Mon Sep 17 00:00:00 2001 From: sooyoung Date: Mon, 19 Feb 2024 12:28:28 +0900 Subject: [PATCH 005/160] -c default 0.8 --- src/strucclustutils/filtercomplex.cpp | 128 +++++++++++++++++++------- 1 file changed, 93 insertions(+), 35 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 0b119382..23228b05 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -16,28 +16,9 @@ #ifdef OPENMP #include #endif -/* -bool checkFilterCriteria(Matcher::result_t &ressum, double seqIdThr, int alnLenThr, int covMode, float covThr) { - const bool seqIdOK = (ressum.seqId >= seqIdThr); - const bool covOK = Util::hasCoverage(covThr, covMode, ressum.qcov, ressum.dbcov); - const bool alnLenOK = Util::hasAlignmentLength(alnLenThr, ressum.alnLength); - //const bool tmOK = (re.) - if - // general accaptance criteria - ( - seqIdOK && - covOK && - alnLenOK - ) { - return true; - } else { - return false; - } -} -*/ + bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr) { const bool covOK = Util::hasCoverage(covThr, covMode, qcov, dbcov); - std::cout< &qChainKeys) { +unsigned int getQueryResidueLength( IndexReader& qDbr, std::vector &qChainKeys) { unsigned int qResidueLen = 0; for (auto qChainKey: qChainKeys) { size_t id = qDbr.sequenceReader->getId(qChainKey); @@ -57,6 +38,24 @@ unsigned int getQueryResidueLength( IndexReader &qDbr, std::vector } return qResidueLen; } +unsigned int getTargetResidueLength( IndexReader *qDbr, std::vector &qChainKeys) { + unsigned int qResidueLen = 0; + for (auto qChainKey: qChainKeys) { + size_t id = qDbr->sequenceReader->getId(qChainKey); + // Not accessible + if (id == NOT_AVAILABLE_CHAIN_KEY) + return 0; + qResidueLen += qDbr->sequenceReader->getSeqLen(id); + } + return qResidueLen; +} +std::vector selecHighestCoverage( std::map> &covMap){ + std::vector assIdvec; + for (auto pair : covMap){ + assIdvec.push_back(pair.second.rbegin()->second); + } + return assIdvec; +} int filtercomplex(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); @@ -71,7 +70,13 @@ int filtercomplex(int argc, const char **argv, const Command &command) { std::map tKeyToSet; IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); - IndexReader tDbr(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + IndexReader* tDbr; + if (sameDB) { + tDbr = &qDbr; + } + else{ + tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + } DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); alnDbr.open(DBReader::LINEAR_ACCCESS); size_t localThreads = 1; @@ -114,7 +119,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (tChainKeys.empty()) { continue; } - unsigned int reslen = getQueryResidueLength(tDbr, tChainKeys); + unsigned int reslen = getTargetResidueLength(tDbr, tChainKeys); tComplexLength[tComplexId] =reslen; } for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { @@ -128,7 +133,8 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { //progress.updateProgress(); - std::map covSum; + std::map qcovSum; + std::map tcovSum; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; std::map assIdTodbKey; std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; @@ -147,22 +153,76 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } data = Util::skipLine(data); unsigned int assId = retComplex.assId; - if (covSum.find(assId) == covSum.end()) { - covSum[assId] = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + if (qcovSum.find(assId) == qcovSum.end()) { + qcovSum[assId] = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + assIdTodbKey.emplace(assId, res.dbKey); + } + else{ + qcovSum[assId] += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + } + if (tcovSum.find(assId) == tcovSum.end()) { + tcovSum[assId] = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); assIdTodbKey.emplace(assId, res.dbKey); } else{ - covSum[assId] += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + tcovSum[assId] += (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); } } } std::string result; - for (const auto& pair : covSum){ + std::vector keysToDelete; + for (const auto& pair : qcovSum){ float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); - float dbcov = static_cast(pair.second) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); + float dbcov = static_cast(tcovSum[pair.first]) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); + + /* if (checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ - //result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(qcov)+ "\t" + std::to_string(dbcov)+ "\n" ; - result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\n" ; + result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(qcov)+ "\t" + std::to_string(dbcov)+ "\t" + std::to_string(pair.first)+ "\n" ; + + }*/ + if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ + keysToDelete.push_back(pair.first); + } + + + } + for (const auto& key : keysToDelete) { + qcovSum.erase(key); + tcovSum.erase(key); + } + + std::map> qcompIdToassIdToalnSum, tcompIdToassIdToalnSum, avgcompIdToassIdToalnSum; + for (const auto& pair : qcovSum){ + if (qcompIdToassIdToalnSum.find(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) == qcompIdToassIdToalnSum.end()){ + qcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] = {{pair.second, pair.first}}; + tcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] ={{ tcovSum[pair.first], pair.first}}; + avgcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] = {{(pair.second+tcovSum[pair.first])/2, pair.first}}; + } + else{ + qcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]][pair.second] = pair.first; + tcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]][tcovSum[pair.first]] = pair.first; + avgcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]][(pair.second + tcovSum[pair.first])/2] = pair.first ; + } + } + std::vector selectedAssIDs; + switch (par.covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + selectedAssIDs = selecHighestCoverage(avgcompIdToassIdToalnSum); + break; + case Parameters::COV_MODE_TARGET: + selectedAssIDs = selecHighestCoverage(tcompIdToassIdToalnSum); + break; + case Parameters::COV_MODE_QUERY: + selectedAssIDs = selecHighestCoverage(qcompIdToassIdToalnSum); + break; + } + + for (const auto& pair : qcovSum){ + if (std::find(selectedAssIDs.begin(), selectedAssIDs.end(), pair.first) != selectedAssIDs.end()){ + float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); + float dbcov = static_cast(tcovSum[pair.first]) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); + result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(qcov)+ "\t" + std::to_string(dbcov)+ "\t" + std::to_string(pair.first)+ "\n" ; + //result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\n" ; } } resultWriter.writeData(result.c_str(), result.length(), qComplexId, 0, isDb, isDb); @@ -173,10 +233,8 @@ int filtercomplex(int argc, const char **argv, const Command &command) { FileUtil::remove(par.db4Index.c_str()); } alnDbr.close(); - /* - if (alnDbr != NULL) { - delete alnDbr; + if (sameDB == false) { + delete tDbr; } - */ return EXIT_SUCCESS; } \ No newline at end of file From e52c527aa481e16879b67cd40735135f86c23dd3 Mon Sep 17 00:00:00 2001 From: sooyoung Date: Mon, 19 Feb 2024 12:52:03 +0900 Subject: [PATCH 006/160] cleaned code --- src/strucclustutils/filtercomplex.cpp | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 23228b05..db02334a 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -69,10 +69,11 @@ int filtercomplex(int argc, const char **argv, const Command &command) { std::map qKeyToSet; std::map tKeyToSet; - IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + IndexReader* qDbr; + qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); IndexReader* tDbr; if (sameDB) { - tDbr = &qDbr; + tDbr = qDbr; } else{ tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); @@ -128,11 +129,10 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (qChainKeys.empty()) { continue; } - unsigned int reslen = getQueryResidueLength(qDbr, qChainKeys); + unsigned int reslen = getTargetResidueLength(qDbr, qChainKeys); qComplexLength[qComplexId] = reslen; } for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { - //progress.updateProgress(); std::map qcovSum; std::map tcovSum; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; @@ -174,12 +174,6 @@ int filtercomplex(int argc, const char **argv, const Command &command) { for (const auto& pair : qcovSum){ float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); float dbcov = static_cast(tcovSum[pair.first]) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); - - /* - if (checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ - result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(qcov)+ "\t" + std::to_string(dbcov)+ "\t" + std::to_string(pair.first)+ "\n" ; - - }*/ if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ keysToDelete.push_back(pair.first); } @@ -219,10 +213,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { for (const auto& pair : qcovSum){ if (std::find(selectedAssIDs.begin(), selectedAssIDs.end(), pair.first) != selectedAssIDs.end()){ - float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); - float dbcov = static_cast(tcovSum[pair.first]) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); - result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(qcov)+ "\t" + std::to_string(dbcov)+ "\t" + std::to_string(pair.first)+ "\n" ; - //result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\n" ; + result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]])+ "\n" ; } } resultWriter.writeData(result.c_str(), result.length(), qComplexId, 0, isDb, isDb); @@ -233,6 +224,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { FileUtil::remove(par.db4Index.c_str()); } alnDbr.close(); + delete qDbr; if (sameDB == false) { delete tDbr; } From aaf1a6b12d664041e343afb7e927c2294bd49055 Mon Sep 17 00:00:00 2001 From: sooyoung Date: Tue, 20 Feb 2024 16:46:37 +0900 Subject: [PATCH 007/160] complexclust.sh --- data/complexclust.sh | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 data/complexclust.sh diff --git a/data/complexclust.sh b/data/complexclust.sh new file mode 100644 index 00000000..899f9d73 --- /dev/null +++ b/data/complexclust.sh @@ -0,0 +1,32 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +# check number of input variables +[ "$#" -ne 4 ] && echo "Please provide " && exit 1; +# check if files exist +[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; +[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1; +[ -f "$3.dbtype" ] && echo "$3.dbtype exists already!" && exit 1; +[ ! -d "$4" ] && echo "tmp directory $4 not found!" && mkdir -p "$4"; + +INPUTSEQ="$1" +INPUTCLUST=="$2" +OUTCLUST="$3" +TMP_PATH="$4" + +COMPLEXINPUT="${TMP_PATH}/${INPUTSEQ}_com" + + +"$MMSEQS" clust "$COMPLEXINPUT" "$INPUTCLUST" "$OUTCLUST" ${CLUSTER_PAR} + From 5ac175fd50c604121795b1ffc07780a020bffe4e Mon Sep 17 00:00:00 2001 From: sooyoung Date: Tue, 20 Feb 2024 17:44:32 +0900 Subject: [PATCH 008/160] erased default -c 0.8 --- src/strucclustutils/filtercomplex.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index db02334a..a776f8c2 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -60,14 +60,17 @@ std::vector selecHighestCoverage( std::map::USE_INDEX); std::map qKeyToSet; std::map tKeyToSet; + char buffer[32]; IndexReader* qDbr; qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); @@ -85,11 +88,12 @@ int filtercomplex(int argc, const char **argv, const Command &command) { #ifdef OPENMP //localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); #endif - const bool shouldCompress = par.dbOut == true && par.compressed == true; - const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE; - DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, dbType); + const bool shouldCompress = (par.compressed == true); + const int db4Type = Parameters::DBTYPE_CLUSTER_RES; + + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, db4Type); resultWriter.open(); - const bool isDb = par.dbOut; + std::string qLookupFile = par.db1 + ".lookup"; std::string tLookupFile = par.db2 + ".lookup"; TranslateNucl translateNucl(static_cast(par.translationTable)); @@ -132,6 +136,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { unsigned int reslen = getTargetResidueLength(qDbr, qChainKeys); qComplexLength[qComplexId] = reslen; } + for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { std::map qcovSum; std::map tcovSum; @@ -176,9 +181,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { float dbcov = static_cast(tcovSum[pair.first]) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ keysToDelete.push_back(pair.first); - } - - + } } for (const auto& key : keysToDelete) { qcovSum.erase(key); @@ -210,19 +213,17 @@ int filtercomplex(int argc, const char **argv, const Command &command) { selectedAssIDs = selecHighestCoverage(qcompIdToassIdToalnSum); break; } - for (const auto& pair : qcovSum){ if (std::find(selectedAssIDs.begin(), selectedAssIDs.end(), pair.first) != selectedAssIDs.end()){ - result += std::to_string(qComplexId)+ "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]])+ "\n" ; + char *outpos = Itoa::u32toa_sse2(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]], buffer); + result.append(buffer, (outpos - buffer - 1)); + result.push_back('\n'); } } - resultWriter.writeData(result.c_str(), result.length(), qComplexId, 0, isDb, isDb); + resultWriter.writeData(result.c_str(), result.length(), qComplexId); } } resultWriter.close(true); - if (isDb == false) { - FileUtil::remove(par.db4Index.c_str()); - } alnDbr.close(); delete qDbr; if (sameDB == false) { From ef00e78579e451d9189e6e95644e551acb9f52e8 Mon Sep 17 00:00:00 2001 From: rachelse Date: Wed, 21 Feb 2024 00:41:46 +0900 Subject: [PATCH 009/160] [IN PROGRESS] Draft state complexcluster.sh --- data/complexcluster.sh | 106 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 data/complexcluster.sh diff --git a/data/complexcluster.sh b/data/complexcluster.sh new file mode 100644 index 00000000..91e471fc --- /dev/null +++ b/data/complexcluster.sh @@ -0,0 +1,106 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +# check number of input variables +[ "$#" -ne 3 ] && echo "Please provide " && exit 1; +# check if files exist +[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; +[ -f "$2.dbtype" ] && echo "$2.dbtype exists already!" && exit 1; +[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; + +INPUT="$1" +TMP_PATH="$3" +SOURCE="$INPUT" + +# DOING : createdb +if notExists "${INPUT}.dbtype"; then + if notExists "${TMP_PATH}/query"; then + # shellcheck disable=SC2086 + "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ + || fail "input createdb died" + fi +fi + +# DOING : search +if notExists "${TMP_PATH}/result.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" search "${INPUT}" "${INPUT}" "${TMP_PATH}/result" "${TMP_PATH}/search_tmp" ${SEARCH_PAR} \ + || fail "Search died" +fi +COMPDB="${TMP_PATH}/result" + +# FIX : expandcomplex ? +if [ "$PREFMODE" != "EXHAUSTIVE" ]; then + if notExists "${TMP_PATH}/result_expand_pref.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" expandcomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/result" "${TMP_PATH}/result_expand_pref" ${THREADS_PAR} \ + || fail "Expandcomplex died" + fi + if notExists "${TMP_PATH}/result_expand_aligned.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" $COMPLEX_ALIGNMENT_ALGO "${INPUT}" "${INPUT}" "${TMP_PATH}/result_expand_pref" "${TMP_PATH}/result_expand_aligned" ${COMPLEX_ALIGN_PAR} \ + || fail $COMPLEX_ALIGNMENT_ALGO "died" + fi + COMPDB="${TMP_PATH}/result_expand_aligned" +fi +# DOING : scorecomplex +if notExists "${TMP_PATH}/result_complex.dbtype"; then + # shellcheck disable=SC2086 + $MMSEQS scorecomplex "${INPUT}" "${INPUT}" "${COMPTDB}" "${TMP_PATH}/result_complex" ${SCORECOMPLEX_PAR} \ + || fail "ScoreComplex died" +fi + +# DOING : filtercomplex +if notExists "${TMP_PATH}/complex_filt"; then + # shellcheck disable=SC2086 + $MMSEQS filtercomplex "${INPUT}" "${INPUT}" "${COMPDB}" "${TMP_PATH}/result_cmplfilt" ${FILTERCOMPLEX_PAR} \ + || fail "FilterComplex died" +fi + +# FIXME : twickDB w/ awk -> db also need to be changed? +INPUT="${TMP_PATH}/cmpl_db" +awk -F"\t" ' + BEGIN {OFFSET=0} + NR==FNR {chain_len[$1]=$3;next} + { + if !($3 in off_arr) { + off_arr[$3]=OFFSET + } + cmpl_len[$3]=chain_len[$1];OFFSET+=chain_len[$1] + } + END { + for (cmpl in off_arr) { + print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] + } +}' "${SOURCE}.index" "${SOURCE}.lookup" > "${TMP_PATH}/cmpl_db.index" + + +# FIXME : clust +if notExists "${TMP_PATH}/clu.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" clust "${INPUT}" "${TMP_PATH}/result_cmplfilt" "$2" ${CLUSTER_PAR} \ + || fail "Clustering died" +fi + +# DOING : remove tmp +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY} + if [ "$PREFMODE" != "EXHAUSTIVE" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/result_expand_aligned" ${VERBOSITY} + fi + rm -rf "${TMP_PATH}/search_tmp" + rm -f "${TMP_PATH}/complexcluster.sh" +fi \ No newline at end of file From b5c45c3777d67f865f6dcc5bbdd5e25b6ee7ad8b Mon Sep 17 00:00:00 2001 From: rachelse Date: Wed, 21 Feb 2024 00:58:34 +0900 Subject: [PATCH 010/160] minor modification --- src/strucclustutils/filtercomplex.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index db02334a..679f7ae1 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -27,6 +27,7 @@ bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr) { return false; } } + unsigned int getQueryResidueLength( IndexReader& qDbr, std::vector &qChainKeys) { unsigned int qResidueLen = 0; for (auto qChainKey: qChainKeys) { @@ -38,6 +39,7 @@ unsigned int getQueryResidueLength( IndexReader& qDbr, std::vector } return qResidueLen; } + unsigned int getTargetResidueLength( IndexReader *qDbr, std::vector &qChainKeys) { unsigned int qResidueLen = 0; for (auto qChainKey: qChainKeys) { @@ -49,6 +51,7 @@ unsigned int getTargetResidueLength( IndexReader *qDbr, std::vector selecHighestCoverage( std::map> &covMap){ std::vector assIdvec; for (auto pair : covMap){ From 51d29b8c6aa2c0b10ff473db6743c417c95d6f2e Mon Sep 17 00:00:00 2001 From: rachelse Date: Wed, 21 Feb 2024 14:50:20 +0900 Subject: [PATCH 011/160] Changed complexcluster.sh to easycomplexcluster.sh --- data/complexclust.sh | 32 ----- ...omplexcluster.sh => easycomplexcluster.sh} | 56 +++++--- src/workflow/EasyComplexCluster.cpp | 121 ++++++++++++++++++ 3 files changed, 161 insertions(+), 48 deletions(-) delete mode 100644 data/complexclust.sh rename data/{complexcluster.sh => easycomplexcluster.sh} (72%) create mode 100644 src/workflow/EasyComplexCluster.cpp diff --git a/data/complexclust.sh b/data/complexclust.sh deleted file mode 100644 index 899f9d73..00000000 --- a/data/complexclust.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/sh -e -fail() { - echo "Error: $1" - exit 1 -} - -notExists() { - [ ! -f "$1" ] -} - -exists() { - [ -f "$1" ] -} - -# check number of input variables -[ "$#" -ne 4 ] && echo "Please provide " && exit 1; -# check if files exist -[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; -[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1; -[ -f "$3.dbtype" ] && echo "$3.dbtype exists already!" && exit 1; -[ ! -d "$4" ] && echo "tmp directory $4 not found!" && mkdir -p "$4"; - -INPUTSEQ="$1" -INPUTCLUST=="$2" -OUTCLUST="$3" -TMP_PATH="$4" - -COMPLEXINPUT="${TMP_PATH}/${INPUTSEQ}_com" - - -"$MMSEQS" clust "$COMPLEXINPUT" "$INPUTCLUST" "$OUTCLUST" ${CLUSTER_PAR} - diff --git a/data/complexcluster.sh b/data/easycomplexcluster.sh similarity index 72% rename from data/complexcluster.sh rename to data/easycomplexcluster.sh index 91e471fc..f6e21878 100644 --- a/data/complexcluster.sh +++ b/data/easycomplexcluster.sh @@ -12,6 +12,42 @@ exists() { [ -f "$1" ] } +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +# Shift initial DB to complexDB using soft-linking +# $1: input db +# $2: output db +buildCmplDb() { + touch "${2}" + awk -F"\t" 'BEGIN {OFFSET=0} + FNR==NR{chain_len[$1]=#3;next} + { + if !)_3 in off_arr) { + off_arr[$3]=OFFSET + } + cmpl_len[$3]=chain_len[$1];OFFSET+=chain_len[$1] + } + END { + for (cmpl in off_arr) { + print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] + } + }' "${1}.index" "${1}.lookup" > "${2}.index" + ln -s "$(abspath "${1}")" "${2}.1" + cp "${1}.dbtype" "${2}.dbtype" +} + # check number of input variables [ "$#" -ne 3 ] && echo "Please provide " && exit 1; # check if files exist @@ -68,23 +104,11 @@ if notExists "${TMP_PATH}/complex_filt"; then || fail "FilterComplex died" fi -# FIXME : twickDB w/ awk -> db also need to be changed? +# FIXME : softlink source to complexDB +if notExists "${TMP_PATH}/cmpl_db.dbtype"; then + buildCmplDb "${SOURCE}" "${TMP_PATH}/cmpl_db" +fi INPUT="${TMP_PATH}/cmpl_db" -awk -F"\t" ' - BEGIN {OFFSET=0} - NR==FNR {chain_len[$1]=$3;next} - { - if !($3 in off_arr) { - off_arr[$3]=OFFSET - } - cmpl_len[$3]=chain_len[$1];OFFSET+=chain_len[$1] - } - END { - for (cmpl in off_arr) { - print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] - } -}' "${SOURCE}.index" "${SOURCE}.lookup" > "${TMP_PATH}/cmpl_db.index" - # FIXME : clust if notExists "${TMP_PATH}/clu.dbtype"; then diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp new file mode 100644 index 00000000..4052776a --- /dev/null +++ b/src/workflow/EasyComplexCluster.cpp @@ -0,0 +1,121 @@ +#include + +#include "LocalParameters.h" +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" + +#include "complexcluster.sh.h" + +namespace structure{ +#include "easycluster.sh.h" +} + +int easycomplexcluster(int argc, const char **argv, const Command &command) { + // LocalParameters &par = LocalParameters::getLocalInstance(); + // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // for (size_t i = 0; i < par.createdb.size(); i++){ + // par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + // } + + // par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + + // par.parseParameters(argc, argv, command, false, Parameters::PARSE_VARIADIC, 0); + // if(par.PARAM_FORMAT_OUTPUT.wasSet == false){ + // par.outfmt = "query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,complexassignid"; + // } + // par.addBacktrace = true; + // par.PARAM_ADD_BACKTRACE.wasSet = true; + // par.printParameters(command.cmd, argc, argv, *command.params); + + // bool needBacktrace = false; + // bool needTaxonomy = false; + // bool needTaxonomyMapping = false; + // bool needLookup = false; + // { + // bool needSequenceDB = false; + // bool needFullHeaders = false; + // bool needSource = false; + // bool needQCA = false; + // bool needTCA = false; + // bool needTMalign = false; + // bool needLDDT = false; + // LocalParameters::getOutputFormat(par.formatAlignmentMode, par.outfmt, needSequenceDB, needBacktrace, needFullHeaders, + // needLookup, needSource, needTaxonomyMapping, needTaxonomy, needQCA, needTCA, needTMalign, needLDDT); + // } + + // if (par.formatAlignmentMode == Parameters::FORMAT_ALIGNMENT_SAM || + // par.formatAlignmentMode == LocalParameters::FORMAT_ALIGNMENT_PDB_SUPERPOSED || + // par.greedyBestHits) { + // needBacktrace = true; + // } + // if (needBacktrace) { + // Debug(Debug::INFO) << "Alignment backtraces will be computed, since they were requested by output format.\n"; + // par.addBacktrace = true; + // par.PARAM_ADD_BACKTRACE.wasSet = true; + // } + // if (needLookup) { + // par.writeLookup = true; + // } + + // std::string tmpDir = par.filenames.back(); + // std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + // if (par.reuseLatest) { + // hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + // } + // tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + // par.filenames.pop_back(); + // CommandCaller cmd; + // if(par.alignmentType == LocalParameters::ALIGNMENT_TYPE_TMALIGN){ + // cmd.addVariable("COMPLEX_ALIGNMENT_ALGO", "tmalign"); + // cmd.addVariable("COMPLEX_ALIGN_PAR", par.createParameterString(par.tmalign).c_str()); + // }else if(par.alignmentType == LocalParameters::ALIGNMENT_TYPE_3DI_AA || par.alignmentType == LocalParameters::ALIGNMENT_TYPE_3DI){ + // cmd.addVariable("COMPLEX_ALIGNMENT_ALGO", "structurealign"); + // cmd.addVariable("COMPLEX_ALIGN_PAR", par.createParameterString(par.structurealign).c_str()); + // } + + // switch(par.prefMode){ + // case LocalParameters::PREF_MODE_KMER: + // cmd.addVariable("PREFMODE", "KMER"); + // break; + // case LocalParameters::PREF_MODE_UNGAPPED: + // cmd.addVariable("PREFMODE", "UNGAPPED"); + // break; + // case LocalParameters::PREF_MODE_EXHAUSTIVE: + // cmd.addVariable("PREFMODE", "EXHAUSTIVE"); + // break; + // } + // if(par.exhaustiveSearch){ + // cmd.addVariable("PREFMODE", "EXHAUSTIVE"); + // } + // cmd.addVariable("NO_REPORT", par.complexReportMode == 0 ? "TRUE" : NULL); + // cmd.addVariable("TMP_PATH", tmpDir.c_str()); + // cmd.addVariable("OUTPUT", par.filenames.back().c_str()); + // par.filenames.pop_back(); + // cmd.addVariable("TARGET", par.filenames.back().c_str()); + // par.filenames.pop_back(); + // cmd.addVariable("QUERY", par.filenames.back().c_str()); + // cmd.addVariable("LEAVE_INPUT", par.dbOut ? "TRUE" : NULL); + // par.filenames.pop_back(); + // cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); + // cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow, true).c_str()); + // cmd.addVariable("CONVERT_PAR", par.createParameterString(par.convertalignments).c_str()); + // cmd.addVariable("REPORT_PAR", par.createParameterString(par.createcomplexreport).c_str()); + // cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); + // cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); + // cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); + // std::string program = tmpDir + "/easycomplexsearch.sh"; + // FileUtil::writeFile(program, easycomplexsearch_sh, easycomplexsearch_sh_len); + // cmd.execProgram(program.c_str(), par.filenames); + // // Should never get here + // assert(false); + // return EXIT_FAILURE; +} \ No newline at end of file From 46611c48b9ee40d23a7886e291a5c8de43b74656 Mon Sep 17 00:00:00 2001 From: sooyoung Date: Wed, 21 Feb 2024 15:27:45 +0900 Subject: [PATCH 012/160] CMakeLists update --- src/workflow/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index fa43f5f8..687ec8ce 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -8,5 +8,7 @@ set(workflow_source_files workflow/EasyStructureCluster.cpp workflow/EasyComplexSearch.cpp workflow/ComplexSearch.cpp + workflow/EasyComplexCluster.cpp + workflow/ComplexCluster.cpp PARENT_SCOPE ) From 38b6095855560fd86a51bff19f8099c2ee897ea6 Mon Sep 17 00:00:00 2001 From: sooyoung Date: Wed, 21 Feb 2024 15:28:44 +0900 Subject: [PATCH 013/160] data/CMakeLists update --- data/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index ad4256af..c969d2d5 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -15,6 +15,8 @@ set(COMPILED_RESOURCES vendor.js.zst complexsearch.sh easycomplexsearch.sh + easycomplexcluster.sh + complexcluster.sh ) set(GENERATED_OUTPUT_HEADERS "") From 83a465492dee67a9dbbaf6d6c4e2e555c8341e0b Mon Sep 17 00:00:00 2001 From: sooyoung Date: Wed, 21 Feb 2024 19:57:29 +0900 Subject: [PATCH 014/160] clustered results to flatfiles --- data/easycomplexcluster.sh | 49 ++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index f6e21878..8c7dcd66 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -32,19 +32,19 @@ abspath() { buildCmplDb() { touch "${2}" awk -F"\t" 'BEGIN {OFFSET=0} - FNR==NR{chain_len[$1]=#3;next} + FNR==NR{chain_len[$1]=$3;next} { - if !)_3 in off_arr) { + if (!($3 in off_arr)) { off_arr[$3]=OFFSET } - cmpl_len[$3]=chain_len[$1];OFFSET+=chain_len[$1] + cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] } END { for (cmpl in off_arr) { print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] } }' "${1}.index" "${1}.lookup" > "${2}.index" - ln -s "$(abspath "${1}")" "${2}.1" + ln -s "$(abspath "${1}")" "${2}" cp "${1}.dbtype" "${2}.dbtype" } @@ -117,14 +117,53 @@ if notExists "${TMP_PATH}/clu.dbtype"; then || fail "Clustering died" fi +# DOING : make tsv file +if notExists "${TMP_PATH}/cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "$2" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + || fail "Convert Alignments died" +fi + +# FIXME : make rep_seq.fasta, and how ? +if notExists "${TMP_PATH}/rep_seq.fasta"; then + # shellcheck disable=SC2086 + "$MMSEQS" result2repseq "${INPUT}" "$2" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ + || fail "Result2repseq died" + + # shellcheck disable=SC2086 + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/clu_rep" "${TMP_PATH}/rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ + || fail "result2flat died" +fi + +# FIXME : make all_seq.fasta, and how ? +if notExists "${TMP_PATH}/all_seqs.fasta"; then + # shellcheck disable=SC2086 + "$MMSEQS" createseqfiledb "${INPUT}" "$2" "${TMP_PATH}/clu_seqs" ${THREADS_PAR} \ + || fail "Result2repseq died" + + # shellcheck disable=SC2086 + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/clu_seqs" "${TMP_PATH}/all_seqs.fasta" ${VERBOSITY_PAR} \ + || fail "result2flat died" +fi + +mv "${TMP_PATH}/all_seqs.fasta" "${RESULTS}_all_seqs.fasta" +mv "${TMP_PATH}/rep_seq.fasta" "${RESULTS}_rep_seq.fasta" +mv "${TMP_PATH}/cluster.tsv" "${RESULTS}_cluster.tsv" + # DOING : remove tmp if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/clu_seqs" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "$2" ${VERBOSITY_PAR} if [ "$PREFMODE" != "EXHAUSTIVE" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/result_expand_aligned" ${VERBOSITY} fi rm -rf "${TMP_PATH}/search_tmp" - rm -f "${TMP_PATH}/complexcluster.sh" + rm -f "${TMP_PATH}/easycomplexcluster.sh" fi \ No newline at end of file From 017ad0fe404818cd12b87d71786e2e2c4d303cfc Mon Sep 17 00:00:00 2001 From: sooyoung Date: Wed, 21 Feb 2024 21:30:04 +0900 Subject: [PATCH 015/160] Updated LocalParameter files --- src/commons/LocalParameters.cpp | 7 +++++++ src/commons/LocalParameters.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 420b14d8..f0a7f942 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -195,6 +195,13 @@ LocalParameters::LocalParameters() : easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, createcomplexreport); easyscomplexsearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE); + //complexclusterworkflow + complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); + complexclusterworkflow = combineList(complexclusterworkflow, clust); + + //easycomplexclusterworkflow + easycomplexclusterworkflow = combineList(structurecreatedb, complexclusterworkflow); + // expandcomplex expandcomplex.push_back(&PARAM_THREADS); expandcomplex.push_back(&PARAM_V); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index ae0f50a1..0e85c367 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -90,6 +90,8 @@ class LocalParameters : public Parameters { std::vector compressca; std::vector scorecomplex; std::vector filtercomplex; + std::vector complexclusterworkflow; + std::vector easycomplexclusterworkflow; std::vector complexsearchworkflow; std::vector easyscomplexsearchworkflow; std::vector createcomplexreport; From 84c5279fb65d48fb7119b138b4c1d6fa7a493178 Mon Sep 17 00:00:00 2001 From: sooyoung Date: Wed, 21 Feb 2024 21:57:16 +0900 Subject: [PATCH 016/160] FoldSeelBase.cpp should be changed though, easy-complexcluster output instruction --- data/CMakeLists.txt | 2 +- src/FoldseekBase.cpp | 42 ++++++++++++++++++++++++++++++++++ src/LocalCommandDeclarations.h | 2 ++ src/workflow/CMakeLists.txt | 2 +- 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index c969d2d5..22a2d0ae 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -15,8 +15,8 @@ set(COMPILED_RESOURCES vendor.js.zst complexsearch.sh easycomplexsearch.sh - easycomplexcluster.sh complexcluster.sh + easycomplexcluster.sh ) set(GENERATED_OUTPUT_HEADERS "") diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index e1599f09..50e96e1d 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -275,6 +275,48 @@ std::vector foldseekCommands = { {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb } } }, + {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_EASY, + "Complex level cluster", + "foldseek complexcluster queryDB result tmp\n" + "# Cluster output\n" + "# Important parameter: --cov-mode and -c \n" + "# --cov-mode \n" + "# 0 1 2\n" + "# Q: MAVGTACRPA 60% IGN 60%\n" + "# T: -AVGTAC--- 60% 100% IGN\n" + "# -c 0.7 - + -\n" + "# -c 0.6 + + +\n\n" + "Seongeun Kim & Sooyoung Cha ", + " ", + CITATION_FOLDSEEK, { + {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, + {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, + {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } + } + }, + {"easy-complexcluster", easycomplexcluster, &localPar.easycomplexclusterworkflow, COMMAND_EASY, + "Complex level cluster", + "foldseek easy-complexcluster example/1tim.pdb.gz result tmp\n" + "# Cluster output\n" + "FIX ME" + "# - result_rep_seq.fasta: Representatives\n" + "# - result_all_seq.fasta: FASTA-like per cluster\n" + "# - result_cluster.tsv: Adjacency list\n\n" + "# Important parameter: --cov-mode and -c \n" + "# --cov-mode \n" + "# 0 1 2\n" + "# Q: MAVGTACRPA 60% IGN 60%\n" + "# T: -AVGTAC--- 60% 100% IGN\n" + "# -c 0.7 - + -\n" + "# -c 0.6 + + +\n\n" + "Seongeun Kim & Sooyoung Cha ", + " ... ", + CITATION_FOLDSEEK, { + {"PDB|mmCIF[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &FoldSeekDbValidator::flatfileStdinAndFolder}, + {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, + {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } + } + }, {"complexsearch", complexsearch, &localPar.complexsearchworkflow, COMMAND_MAIN, "Complex level search", "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 60f2251d..2f6c00fd 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -22,6 +22,8 @@ extern int convert2pdb(int argc, const char** argv, const Command &command); extern int compressca(int argc, const char** argv, const Command &command); extern int scorecomplex(int argc, const char **argv, const Command& command); extern int filtercomplex(int argc, const char **argv, const Command& command); +extern int easycomplexcluster(int argc, const char** argv, const Command &command); +extern int complexcluster(int argc, const char** argv, const Command &command); extern int easycomplexsearch(int argc, const char **argv, const Command &command); extern int createcomplexreport(int argc, const char **argv, const Command &command); extern int expandcomplex(int argc, const char **argv, const Command &command); diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index 687ec8ce..8d551f79 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -8,7 +8,7 @@ set(workflow_source_files workflow/EasyStructureCluster.cpp workflow/EasyComplexSearch.cpp workflow/ComplexSearch.cpp - workflow/EasyComplexCluster.cpp workflow/ComplexCluster.cpp + workflow/EasyComplexCluster.cpp PARENT_SCOPE ) From e15a22416c131742addbdb07129ce50996f0edc0 Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 22 Feb 2024 03:58:08 +0900 Subject: [PATCH 017/160] [IN PROGRESS] separated complexcluster and easycomplexcluster but need to organize --- data/complexcluster.sh | 74 +++++++++++++++++++++++++++++ data/easycomplexcluster.sh | 97 +++++++------------------------------- 2 files changed, 91 insertions(+), 80 deletions(-) create mode 100644 data/complexcluster.sh diff --git a/data/complexcluster.sh b/data/complexcluster.sh new file mode 100644 index 00000000..3aee8549 --- /dev/null +++ b/data/complexcluster.sh @@ -0,0 +1,74 @@ +#!/bin/sh -e +#TODO: maybe change file name into filtercomplex.sh +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +# Shift initial DB to complexDB using soft-linking +# $1: input db +# $2: output db +buildCmplDb() { + touch "${2}" + awk -F"\t" 'BEGIN {OFFSET=0} + FNR==NR{chain_len[$1]=$3;next} + { + if (!($3 in off_arr)) { + off_arr[$3]=OFFSET + } + cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] + } + END { + for (cmpl in off_arr) { + print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] + } + }' "${1}.index" "${1}.lookup" > "${2}.index" + ln -s "$(abspath "${1}")" "${2}.0" + cp "${1}.dbtype" "${2}.dbtype" +} + +# check number of input variables +[ "$#" -ne 4 ] && echo "Please provide " && exit 1; +# check if files exist +[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; +[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1; +[ ! -f "$3.dbtype" ] && echo "$3.dbtype not found!" && exit 1; +[ -f "$4.dbtype" ] && echo "$4.dbtype exists already!" && exit 1; #FIXME + +# TODO : replace TMP_PATH + +# DOING : filtercomplex +if notExists "$4"; then + # shellcheck disable=SC2086 + $MMSEQS filtercomplex "$1" "$2" "$3" "$4" ${FILTERCOMPLEX_PAR} \ + || fail "FilterComplex died" +fi + +# FIXME : softlink source to complexDB +if notExists "${TMP_PATH}/cmpl_db.dbtype"; then + buildCmplDb "${SOURCE}" "${TMP_PATH}/cmpl_db" +fi + +# TODO : remove tmp \ No newline at end of file diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 8c7dcd66..ff438c71 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -12,42 +12,6 @@ exists() { [ -f "$1" ] } -abspath() { - if [ -d "$1" ]; then - (cd "$1"; pwd) - elif [ -f "$1" ]; then - if [ -z "${1##*/*}" ]; then - echo "$(cd "${1%/*}"; pwd)/${1##*/}" - else - echo "$(pwd)/$1" - fi - elif [ -d "$(dirname "$1")" ]; then - echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" - fi -} - -# Shift initial DB to complexDB using soft-linking -# $1: input db -# $2: output db -buildCmplDb() { - touch "${2}" - awk -F"\t" 'BEGIN {OFFSET=0} - FNR==NR{chain_len[$1]=$3;next} - { - if (!($3 in off_arr)) { - off_arr[$3]=OFFSET - } - cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] - } - END { - for (cmpl in off_arr) { - print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] - } - }' "${1}.index" "${1}.lookup" > "${2}.index" - ln -s "$(abspath "${1}")" "${2}" - cp "${1}.dbtype" "${2}.dbtype" -} - # check number of input variables [ "$#" -ne 3 ] && echo "Please provide " && exit 1; # check if files exist @@ -68,52 +32,26 @@ if notExists "${INPUT}.dbtype"; then fi fi -# DOING : search -if notExists "${TMP_PATH}/result.dbtype"; then +# DOING : complexsearch +if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" search "${INPUT}" "${INPUT}" "${TMP_PATH}/result" "${TMP_PATH}/search_tmp" ${SEARCH_PAR} \ - || fail "Search died" + "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ + || fail "ComplexSearch died" fi -COMPDB="${TMP_PATH}/result" +COMPDB="${TMP_PATH}/complexsearch_tmp" -# FIX : expandcomplex ? -if [ "$PREFMODE" != "EXHAUSTIVE" ]; then - if notExists "${TMP_PATH}/result_expand_pref.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" expandcomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/result" "${TMP_PATH}/result_expand_pref" ${THREADS_PAR} \ - || fail "Expandcomplex died" - fi - if notExists "${TMP_PATH}/result_expand_aligned.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" $COMPLEX_ALIGNMENT_ALGO "${INPUT}" "${INPUT}" "${TMP_PATH}/result_expand_pref" "${TMP_PATH}/result_expand_aligned" ${COMPLEX_ALIGN_PAR} \ - || fail $COMPLEX_ALIGNMENT_ALGO "died" - fi - COMPDB="${TMP_PATH}/result_expand_aligned" -fi -# DOING : scorecomplex -if notExists "${TMP_PATH}/result_complex.dbtype"; then - # shellcheck disable=SC2086 - $MMSEQS scorecomplex "${INPUT}" "${INPUT}" "${COMPTDB}" "${TMP_PATH}/result_complex" ${SCORECOMPLEX_PAR} \ - || fail "ScoreComplex died" -fi - -# DOING : filtercomplex -if notExists "${TMP_PATH}/complex_filt"; then - # shellcheck disable=SC2086 - $MMSEQS filtercomplex "${INPUT}" "${INPUT}" "${COMPDB}" "${TMP_PATH}/result_cmplfilt" ${FILTERCOMPLEX_PAR} \ - || fail "FilterComplex died" -fi - -# FIXME : softlink source to complexDB +# DOING : call complexcluster or filtercomplex+awk +# TODO : maybe save filtercomplex result file to sub-dir of TMP_PATH if notExists "${TMP_PATH}/cmpl_db.dbtype"; then - buildCmplDb "${SOURCE}" "${TMP_PATH}/cmpl_db" + $MMSEQS "${FILTER_MODULE} "${INPUT}" "${INPUT}" "${COMPDB}" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ + || fail "FilterComplex died" fi -INPUT="${TMP_PATH}/cmpl_db" +INPUT="${TMP_PATH}/cmpl" # FIXME : clust if notExists "${TMP_PATH}/clu.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" clust "${INPUT}" "${TMP_PATH}/result_cmplfilt" "$2" ${CLUSTER_PAR} \ + "$MMSEQS" clust "${INPUT}" "${TMP_PATH}/complex_filt" "{TMP_PATH}/$2" ${CLUSTER_PAR} \ || fail "Clustering died" fi @@ -125,6 +63,7 @@ if notExists "${TMP_PATH}/cluster.tsv"; then fi # FIXME : make rep_seq.fasta, and how ? +# TODO: figure out how to represent complex sequences as a single fasta entry? if notExists "${TMP_PATH}/rep_seq.fasta"; then # shellcheck disable=SC2086 "$MMSEQS" result2repseq "${INPUT}" "$2" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ @@ -150,20 +89,18 @@ mv "${TMP_PATH}/all_seqs.fasta" "${RESULTS}_all_seqs.fasta" mv "${TMP_PATH}/rep_seq.fasta" "${RESULTS}_rep_seq.fasta" mv "${TMP_PATH}/cluster.tsv" "${RESULTS}_cluster.tsv" -# DOING : remove tmp +# TODO : remove tmp -> tide up and organize if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY} + "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/clu_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "$2" ${VERBOSITY_PAR} - if [ "$PREFMODE" != "EXHAUSTIVE" ]; then - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/result_expand_aligned" ${VERBOSITY} - fi - rm -rf "${TMP_PATH}/search_tmp" + rm -rf "${TMP_PATH}/complexsearch_tmp" rm -f "${TMP_PATH}/easycomplexcluster.sh" fi \ No newline at end of file From 3782b55053b4b8f32d6f6b07a99173b0e43d57f1 Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 22 Feb 2024 03:58:30 +0900 Subject: [PATCH 018/160] Made workflow file --- src/workflow/ComplexCluster.cpp | 20 ++++++++++++++++++++ src/workflow/EasyComplexCluster.cpp | 12 +++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 src/workflow/ComplexCluster.cpp diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp new file mode 100644 index 00000000..73afe302 --- /dev/null +++ b/src/workflow/ComplexCluster.cpp @@ -0,0 +1,20 @@ +// TODO : Do we need to name this file filterCluster? +#include + +#include "LocalParameters.h" +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" + +#include "complexcluster.sh.h" + +// namespace structure{ +// #include "easycluster.sh.h" +// #include "LocalCommandDeclarations.h" +// } + +int complexcluster(int argc, const char **argv, const Command &command) +{ + return 0; +} \ No newline at end of file diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 4052776a..130636a0 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -6,11 +6,12 @@ #include "Util.h" #include "Debug.h" -#include "complexcluster.sh.h" +#include "easycomplexcluster.sh.h" -namespace structure{ -#include "easycluster.sh.h" -} +// namespace structure{ +// #include "easycluster.sh.h" +// #include "LocalCommandDeclarations.h" +// } int easycomplexcluster(int argc, const char **argv, const Command &command) { // LocalParameters &par = LocalParameters::getLocalInstance(); @@ -118,4 +119,5 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { // // Should never get here // assert(false); // return EXIT_FAILURE; -} \ No newline at end of file + return 0; +} From 8667be3dfc8b53085b8afb6f4f933a66d5ddd254 Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 22 Feb 2024 04:06:36 +0900 Subject: [PATCH 019/160] [TODO] Build failed. check localparameters, workflowfiles, etc. --- src/FoldseekBase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 50e96e1d..1d66809b 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -275,7 +275,7 @@ std::vector foldseekCommands = { {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb } } }, - {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_EASY, + {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, //TODO: maybe COMMAND_MAIN? "Complex level cluster", "foldseek complexcluster queryDB result tmp\n" "# Cluster output\n" From 413faeeb2922d7eba4b87680116e78572c6a9e77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=B0=A8=EC=BD=94?= Date: Mon, 26 Feb 2024 19:04:29 +0900 Subject: [PATCH 020/160] Add filtercomplex parameter for coverage --- src/commons/LocalParameters.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index f0a7f942..615634c4 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -28,7 +28,8 @@ LocalParameters::LocalParameters() : PARAM_INDEX_EXCLUDE(PARAM_INDEX_EXCLUDE_ID, "--index-exclude", "Index Exclusion", "Exclude parts of the index:\n0: Full index\n1: Exclude k-mer index (for use with --prefilter-mode 1)\n2: Exclude C-alpha coordinates (for use with --sort-by-structure-bits 0)\nFlags can be combined bit wise", typeid(int), (void *) &indexExclude, "^[0-3]{1}$", MMseqsParameter::COMMAND_EXPERT), PARAM_COMPLEX_REPORT_MODE(PARAM_COMPLEX_REPORT_MODE_ID, "--complex-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &complexReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN), - PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$") + PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), + PARAM_FILTER_COVERAGE_THRESHOLD(PARAM_FILTER_COVERAGE_THRESHOLD_ID, "--filter-coverage", "Filtercomplex Coverage Threshold", "filters alignments with complex coverage > thr [0.0,1.0]",typeid(float), (void *) &filterCovThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$"; @@ -177,7 +178,7 @@ LocalParameters::LocalParameters() : //filtercomplex filtercomplex.push_back(&PARAM_V); filtercomplex.push_back(&PARAM_THREADS); - filtercomplex.push_back(&PARAM_C); + filtercomplex.push_back(&PARAM_FILTER_COVERAGE_THRESHOLD); filtercomplex.push_back(&PARAM_COV_MODE); // createcomplexreport @@ -225,6 +226,7 @@ LocalParameters::LocalParameters() : coordStoreMode = COORD_STORE_MODE_CA_DIFF; clusterSearch = 0; inputFormat = 0; // auto detect + filterCovThr = 0.8; fileInclude = ".*"; fileExclude = "^$"; dbSuffixList = "_h,_ss,_ca"; From 40a0e71966729401ed3588f56c10d61e086f71ea Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 26 Feb 2024 23:02:01 +0900 Subject: [PATCH 021/160] to share status --- data/test_complexcluster.sh | 82 +++++++++++++++++++++++++++++ data/test_easycomplexcluster.sh | 91 +++++++++++++++++++++++++++++++++ src/commons/LocalParameters.h | 1 + 3 files changed, 174 insertions(+) create mode 100644 data/test_complexcluster.sh create mode 100644 data/test_easycomplexcluster.sh diff --git a/data/test_complexcluster.sh b/data/test_complexcluster.sh new file mode 100644 index 00000000..c051fa94 --- /dev/null +++ b/data/test_complexcluster.sh @@ -0,0 +1,82 @@ +#!/bin/sh -e +#TODO: maybe change file name into filtercomplex.sh +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +# Shift initial DB to complexDB using soft-linking +# $1: input db +# $2: output db +buildCmplDb() { + touch "${2}" + awk -F"\t" 'BEGIN {OFFSET=0} + FNR==NR{chain_len[$1]=$3;next} + { + if (!($3 in off_arr)) { + off_arr[$3]=OFFSET + } + cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] + } + END { + for (cmpl in off_arr) { + print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] + } + }' "${1}.index" "${1}.lookup" > "${2}.index" + ln -s "$(abspath "${1}")" "${2}.0" + cp "${1}.dbtype" "${2}.dbtype" +} + +# check number of input variables +[ "$#" -ne 3 ] && echo "Please provide " && exit 1; + +# TODO : replace TMP_PATH +FILTALN="${QUERY}_filtcomp" +# DOING : filtercomplex +if notExists "${TMP_PATH}/${FILTALN}.dbtype"; then + # shellcheck disable=SC2086 + $MMSEQS filtercomplex ${QUERY} ${TARGET} ${TMP_PATH}/${FILTALN} ${FILTERCOMPLEX_PAR} \ + || fail "FilterComplex died" +fi + +# FIXME : softlink source to complexDB +if notExists "${TMP_PATH}/cmpl_db.dbtype"; then + buildCmplDb "${SOURCE}" "${TMP_PATH}/cmpl_db" +fi + +INPUT="${TMP_PATH}/cmpl" +# FIXME : clust +if notExists "${TMP_PATH}/clu.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" clust "${INPUT}" ${TMP_PATH}/${FILTALN} "${RESULT}" ${CLUSTER_PAR} \ + || fail "Clustering died" +fi + +# TODO : remove tmp +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/${FILTALN}" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/cmpl_db" ${VERBOSITY_PAR} + rm -rf ${TMP_PATH}/complexcluster.sh diff --git a/data/test_easycomplexcluster.sh b/data/test_easycomplexcluster.sh new file mode 100644 index 00000000..509c0167 --- /dev/null +++ b/data/test_easycomplexcluster.sh @@ -0,0 +1,91 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +# check if files exist +[ ! -f "${INPUT}.dbtype" ] && echo "${INPUT}.dbtype not found!" && exit 1; +[ ! -d "${TMP_PATH}" ] && echo "tmp directory ${TMP_PATH} not found!" && mkdir -p "${TMP_PATH}"; + +# DOING : createdb +if notExists "${INPUT}.dbtype"; then + if notExists "${TMP_PATH}/query"; then + # shellcheck disable=SC2086 + "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ + || fail "input createdb died" + fi +fi + +# DOING : complexsearch +if notExists "${TMP_PATH}/complex_result.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ + || fail "ComplexSearch died" +fi +COMPDB="${TMP_PATH}/complexsearch_tmp" + +# DOING : call complexcluster or filtercomplex+awk +# TODO : maybe save filtercomplex result file to sub-dir of TMP_PATH +if notExists "${TMP_PATH}/${RESULT}.dbtype"; then + $MMSEQS "${CLUSTER_MODULE}" "${INPUT}" "${INPUT}" "${TMP_PATH}/${RESULT}" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ + || fail "ClusterComplex died" +fi + +# DOING : make tsv file +if notExists "${TMP_PATH}/cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/${RESULTS}" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + || fail "Convert Alignments died" +fi + +# FIXME : make rep_seq.fasta, and how ? +# TODO: figure out how to represent complex sequences as a single fasta entry? +if notExists "${TMP_PATH}/rep_seq.fasta"; then + # shellcheck disable=SC2086 + "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/${RESULTS}" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ + || fail "Result2repseq died" + + # shellcheck disable=SC2086 + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/clu_rep" "${TMP_PATH}/rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ + || fail "result2flat died" +fi + +# FIXME : make all_seq.fasta, and how ? +if notExists "${TMP_PATH}/all_seqs.fasta"; then + # shellcheck disable=SC2086 + "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/${RESULTS}" "${TMP_PATH}/clu_seqs" ${THREADS_PAR} \ + || fail "Result2repseq died" + + # shellcheck disable=SC2086 + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/clu_seqs" "${TMP_PATH}/all_seqs.fasta" ${VERBOSITY_PAR} \ + || fail "result2flat died" +fi + +mv "${TMP_PATH}/all_seqs.fasta" "${RESULTS}_all_seqs.fasta" +mv "${TMP_PATH}/rep_seq.fasta" "${RESULTS}_rep_seq.fasta" +mv "${TMP_PATH}/cluster.tsv" "${RESULTS}_cluster.tsv" + +# TODO : remove tmp -> tide up and organize +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/clu_seqs" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "$2" ${VERBOSITY_PAR} + rm -rf "${TMP_PATH}/complexsearch_tmp" + rm -f "${TMP_PATH}/easycomplexcluster.sh" +fi \ No newline at end of file diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 0e85c367..b43c21ad 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -117,6 +117,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_COMPLEX_REPORT_MODE) PARAMETER(PARAM_EXPAND_COMPLEX_EVALUE) PARAMETER(PARAM_INPUT_FORMAT) + PARAMETER(PARAM_FILTER_COVERAGE_THRESHOLD) int prefMode; float tmScoreThr; From 47cfb386373c9fcb990cb9668231c2ca626c0001 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 26 Feb 2024 23:05:03 +0900 Subject: [PATCH 022/160] share status --- src/workflow/EasyComplexCluster.cpp | 179 ++++++++++++---------------- 1 file changed, 76 insertions(+), 103 deletions(-) diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 130636a0..8fce0268 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -8,116 +8,89 @@ #include "easycomplexcluster.sh.h" -// namespace structure{ -// #include "easycluster.sh.h" -// #include "LocalCommandDeclarations.h" -// } +void setEasyComplexClusterDefaults(Parameters *p) { + //TODO + // p->PARAM_C = 0.8; + p->PARAM_COV_MODE = 1; + p->sensitivity = 4; + p->PARAM_CLUSTER_MODE = Parameters::GREEDY; + p->evalThr = 0.001; + p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + p->gapOpen = 10; + p->gapExtend = 1; +} + +void setEasyComplexClusterMustPassAlong(Parameters *p) { + // p->PARAM_C.wasSet = true; + p->PARAM_E.wasSet = true; + p->PARAM_ALIGNMENT_MODE.wasSet = true; + p->PARAM_S.wasSet = true; + par->addBacktrace = true; + par->PARAM_ADD_BACKTRACE.wasSet = true; +} int easycomplexcluster(int argc, const char **argv, const Command &command) { - // LocalParameters &par = LocalParameters::getLocalInstance(); - // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // for (size_t i = 0; i < par.createdb.size(); i++){ - // par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - // } + LocalParameters &par = LocalParameters::getLocalInstance(); + par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); + + for (size_t i = 0; i < par.createdb.size(); i++){ + par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } + par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + + setEasyComplexSearchDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setEasyComplexClusterMustPassAlong(&par); + + if (par.formatAlignmentMode == Parameters::FORMAT_ALIGNMENT_SAM || + par.formatAlignmentMode == LocalParameters::FORMAT_ALIGNMENT_PDB_SUPERPOSED || + par.greedyBestHits) { + needBacktrace = true; + } + if (needBacktrace) { + Debug(Debug::INFO) << "Alignment backtraces will be computed, since they were requested by output format.\n"; + par.addBacktrace = true; + par.PARAM_ADD_BACKTRACE.wasSet = true; + } + + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); - // par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + cmd.addVariable("TMP_PATH", tmpDir.c_str()); + cmd.addVariable("RESULT", par.filenames.back().c_str()); + par.filenames.pop_back(); + cmd.addVariable("INPUT", par.filenames.back().c_str()); + par.filenames.pop_back(); - // par.parseParameters(argc, argv, command, false, Parameters::PARSE_VARIADIC, 0); - // if(par.PARAM_FORMAT_OUTPUT.wasSet == false){ - // par.outfmt = "query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,complexassignid"; - // } - // par.addBacktrace = true; - // par.PARAM_ADD_BACKTRACE.wasSet = true; - // par.printParameters(command.cmd, argc, argv, *command.params); + cmd.addVariable("CLUSTER_MODULE", "complexcluster"); + cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); + cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow).c_str()); + cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); + cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); + cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); - // bool needBacktrace = false; - // bool needTaxonomy = false; - // bool needTaxonomyMapping = false; - // bool needLookup = false; - // { - // bool needSequenceDB = false; - // bool needFullHeaders = false; - // bool needSource = false; - // bool needQCA = false; - // bool needTCA = false; - // bool needTMalign = false; - // bool needLDDT = false; - // LocalParameters::getOutputFormat(par.formatAlignmentMode, par.outfmt, needSequenceDB, needBacktrace, needFullHeaders, - // needLookup, needSource, needTaxonomyMapping, needTaxonomy, needQCA, needTCA, needTMalign, needLDDT); - // } + cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); - // if (par.formatAlignmentMode == Parameters::FORMAT_ALIGNMENT_SAM || - // par.formatAlignmentMode == LocalParameters::FORMAT_ALIGNMENT_PDB_SUPERPOSED || - // par.greedyBestHits) { - // needBacktrace = true; - // } - // if (needBacktrace) { - // Debug(Debug::INFO) << "Alignment backtraces will be computed, since they were requested by output format.\n"; - // par.addBacktrace = true; - // par.PARAM_ADD_BACKTRACE.wasSet = true; - // } - // if (needLookup) { - // par.writeLookup = true; - // } + std::string program = tmpDir + "/easycomplexcluster.sh"; + FileUtil::writeFile(program, easycomplexcluster_sh, easycomplexcluster_sh_len); + cmd.execProgram(program.c_str(), par.filenames); - // std::string tmpDir = par.filenames.back(); - // std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); - // if (par.reuseLatest) { - // hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); - // } - // tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); - // par.filenames.pop_back(); - // CommandCaller cmd; - // if(par.alignmentType == LocalParameters::ALIGNMENT_TYPE_TMALIGN){ - // cmd.addVariable("COMPLEX_ALIGNMENT_ALGO", "tmalign"); - // cmd.addVariable("COMPLEX_ALIGN_PAR", par.createParameterString(par.tmalign).c_str()); - // }else if(par.alignmentType == LocalParameters::ALIGNMENT_TYPE_3DI_AA || par.alignmentType == LocalParameters::ALIGNMENT_TYPE_3DI){ - // cmd.addVariable("COMPLEX_ALIGNMENT_ALGO", "structurealign"); - // cmd.addVariable("COMPLEX_ALIGN_PAR", par.createParameterString(par.structurealign).c_str()); - // } - // switch(par.prefMode){ - // case LocalParameters::PREF_MODE_KMER: - // cmd.addVariable("PREFMODE", "KMER"); - // break; - // case LocalParameters::PREF_MODE_UNGAPPED: - // cmd.addVariable("PREFMODE", "UNGAPPED"); - // break; - // case LocalParameters::PREF_MODE_EXHAUSTIVE: - // cmd.addVariable("PREFMODE", "EXHAUSTIVE"); - // break; - // } - // if(par.exhaustiveSearch){ - // cmd.addVariable("PREFMODE", "EXHAUSTIVE"); - // } - // cmd.addVariable("NO_REPORT", par.complexReportMode == 0 ? "TRUE" : NULL); - // cmd.addVariable("TMP_PATH", tmpDir.c_str()); - // cmd.addVariable("OUTPUT", par.filenames.back().c_str()); - // par.filenames.pop_back(); - // cmd.addVariable("TARGET", par.filenames.back().c_str()); - // par.filenames.pop_back(); - // cmd.addVariable("QUERY", par.filenames.back().c_str()); - // cmd.addVariable("LEAVE_INPUT", par.dbOut ? "TRUE" : NULL); - // par.filenames.pop_back(); - // cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); - // cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow, true).c_str()); - // cmd.addVariable("CONVERT_PAR", par.createParameterString(par.convertalignments).c_str()); - // cmd.addVariable("REPORT_PAR", par.createParameterString(par.createcomplexreport).c_str()); - // cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); - // cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); - // cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); - // std::string program = tmpDir + "/easycomplexsearch.sh"; - // FileUtil::writeFile(program, easycomplexsearch_sh, easycomplexsearch_sh_len); - // cmd.execProgram(program.c_str(), par.filenames); - // // Should never get here - // assert(false); - // return EXIT_FAILURE; + // Should never get here + assert(false); + return EXIT_FAILURE; return 0; } From 39b2f0627d0783440e5027344b8a4ad77dc94b32 Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 27 Feb 2024 13:00:13 +0900 Subject: [PATCH 023/160] renamed complexcluster.sh to filtercomplex.sh and finalized --- data/{complexcluster.sh => filtercomplex.sh} | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) rename data/{complexcluster.sh => filtercomplex.sh} (88%) diff --git a/data/complexcluster.sh b/data/filtercomplex.sh similarity index 88% rename from data/complexcluster.sh rename to data/filtercomplex.sh index 3aee8549..b6c425b5 100644 --- a/data/complexcluster.sh +++ b/data/filtercomplex.sh @@ -55,20 +55,16 @@ buildCmplDb() { [ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; [ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1; [ ! -f "$3.dbtype" ] && echo "$3.dbtype not found!" && exit 1; -[ -f "$4.dbtype" ] && echo "$4.dbtype exists already!" && exit 1; #FIXME +[ -f "$4.dbtype" ] && echo "$4.dbtype exists already!" && exit 1; -# TODO : replace TMP_PATH - -# DOING : filtercomplex if notExists "$4"; then # shellcheck disable=SC2086 $MMSEQS filtercomplex "$1" "$2" "$3" "$4" ${FILTERCOMPLEX_PAR} \ || fail "FilterComplex died" fi -# FIXME : softlink source to complexDB -if notExists "${TMP_PATH}/cmpl_db.dbtype"; then - buildCmplDb "${SOURCE}" "${TMP_PATH}/cmpl_db" +if notExists "${CMPLDB_PATH}/cmpl_db.dbtype"; then + buildCmplDb "${SOURCE}" "${CMPLDB_PATH}/cmpl_db" fi -# TODO : remove tmp \ No newline at end of file +# DONE : remove tmp -> No TMP file generated \ No newline at end of file From ec234b1d782699f2d7491e8e6d12fc2cb63e2882 Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 27 Feb 2024 14:14:11 +0900 Subject: [PATCH 024/160] revised parameters for filtercomplex --- src/FoldseekBase.cpp | 42 ++++++++++++++++----------------- src/LocalCommandDeclarations.h | 2 +- src/commons/LocalParameters.cpp | 13 ++++++---- src/commons/LocalParameters.h | 2 +- 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 1d66809b..b64a2929 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -263,8 +263,8 @@ std::vector foldseekCommands = { {"complexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb} } }, - {"filtercomplex", filtercomplex, &localPar.filtercomplex, COMMAND_HIDDEN, - "Filters complexes", + {"filtercomplex", filtercomplex, &localPar.filtercomplexworkflow, COMMAND_HIDDEN, + "Filters complexes and generates a new complexDB", //FIX: explain about output complexDB+clustDB? "foldseek filtercomplex queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", "Seongeun Kim & Sooyoung Cha ", " ", @@ -275,25 +275,25 @@ std::vector foldseekCommands = { {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb } } }, - {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, //TODO: maybe COMMAND_MAIN? - "Complex level cluster", - "foldseek complexcluster queryDB result tmp\n" - "# Cluster output\n" - "# Important parameter: --cov-mode and -c \n" - "# --cov-mode \n" - "# 0 1 2\n" - "# Q: MAVGTACRPA 60% IGN 60%\n" - "# T: -AVGTAC--- 60% 100% IGN\n" - "# -c 0.7 - + -\n" - "# -c 0.6 + + +\n\n" - "Seongeun Kim & Sooyoung Cha ", - " ", - CITATION_FOLDSEEK, { - {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, - {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, - {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } - } - }, + // {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, //TODO: maybe COMMAND_MAIN? + // "Complex level cluster", + // "foldseek complexcluster queryDB result tmp\n" + // "# Cluster output\n" + // "# Important parameter: --cov-mode and -c \n" + // "# --cov-mode \n" + // "# 0 1 2\n" + // "# Q: MAVGTACRPA 60% IGN 60%\n" + // "# T: -AVGTAC--- 60% 100% IGN\n" + // "# -c 0.7 - + -\n" + // "# -c 0.6 + + +\n\n" + // "Seongeun Kim & Sooyoung Cha ", + // " ", + // CITATION_FOLDSEEK, { + // {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, + // {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, + // {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } + // } + // }, {"easy-complexcluster", easycomplexcluster, &localPar.easycomplexclusterworkflow, COMMAND_EASY, "Complex level cluster", "foldseek easy-complexcluster example/1tim.pdb.gz result tmp\n" diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 2f6c00fd..e9e3344a 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -23,7 +23,7 @@ extern int compressca(int argc, const char** argv, const Command &command); extern int scorecomplex(int argc, const char **argv, const Command& command); extern int filtercomplex(int argc, const char **argv, const Command& command); extern int easycomplexcluster(int argc, const char** argv, const Command &command); -extern int complexcluster(int argc, const char** argv, const Command &command); +// extern int complexcluster(int argc, const char** argv, const Command &command); extern int easycomplexsearch(int argc, const char **argv, const Command &command); extern int createcomplexreport(int argc, const char **argv, const Command &command); extern int expandcomplex(int argc, const char **argv, const Command &command); diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 615634c4..e00dbeb0 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -196,12 +196,15 @@ LocalParameters::LocalParameters() : easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, createcomplexreport); easyscomplexsearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE); - //complexclusterworkflow - complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); - complexclusterworkflow = combineList(complexclusterworkflow, clust); + // TODO : filtercomplexworkflow + filtercomplexworkflow = filtercomplex; + // filtercomplexworkflow.push_back() + + // TODO: easycomplexclusterworkflow: after clust? + easycomplexclusterworkflow = combineList(structurecreatedb, complexsearchworkflow); + easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, filtercomplexworkflow); + easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, clust); - //easycomplexclusterworkflow - easycomplexclusterworkflow = combineList(structurecreatedb, complexclusterworkflow); // expandcomplex expandcomplex.push_back(&PARAM_THREADS); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index b43c21ad..5c4caa12 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -90,7 +90,7 @@ class LocalParameters : public Parameters { std::vector compressca; std::vector scorecomplex; std::vector filtercomplex; - std::vector complexclusterworkflow; + std::vector filtercomplexworkflow; std::vector easycomplexclusterworkflow; std::vector complexsearchworkflow; std::vector easyscomplexsearchworkflow; From 03c635e54d894ed980fc318dc5bdc5d1bfe06237 Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 27 Feb 2024 14:14:40 +0900 Subject: [PATCH 025/160] Changed ComplexCluster into FilterComplex --- src/workflow/{ComplexCluster.cpp => FilterComplex.cpp} | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) rename src/workflow/{ComplexCluster.cpp => FilterComplex.cpp} (54%) diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/FilterComplex.cpp similarity index 54% rename from src/workflow/ComplexCluster.cpp rename to src/workflow/FilterComplex.cpp index 73afe302..d7a26a04 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/FilterComplex.cpp @@ -1,18 +1,12 @@ -// TODO : Do we need to name this file filterCluster? #include -#include "LocalParameters.h" #include "FileUtil.h" #include "CommandCaller.h" #include "Util.h" #include "Debug.h" +#include "LocalParameters.h" -#include "complexcluster.sh.h" - -// namespace structure{ -// #include "easycluster.sh.h" -// #include "LocalCommandDeclarations.h" -// } +#include "filtercluster.sh.h" int complexcluster(int argc, const char **argv, const Command &command) { From f7b9508ef279b9729f3dbe6573c098a06446719a Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 27 Feb 2024 15:39:19 +0900 Subject: [PATCH 026/160] Changes --- data/easycomplexcluster.sh | 16 ++--- data/test_complexcluster.sh | 82 -------------------------- data/test_easycomplexcluster.sh | 91 ----------------------------- src/commons/LocalParameters.cpp | 11 ++-- src/commons/LocalParameters.h | 1 - src/workflow/EasyComplexCluster.cpp | 35 ++++------- 6 files changed, 23 insertions(+), 213 deletions(-) delete mode 100644 data/test_complexcluster.sh delete mode 100644 data/test_easycomplexcluster.sh diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index ff438c71..18a899bd 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -43,7 +43,7 @@ COMPDB="${TMP_PATH}/complexsearch_tmp" # DOING : call complexcluster or filtercomplex+awk # TODO : maybe save filtercomplex result file to sub-dir of TMP_PATH if notExists "${TMP_PATH}/cmpl_db.dbtype"; then - $MMSEQS "${FILTER_MODULE} "${INPUT}" "${INPUT}" "${COMPDB}" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ + $MMSEQS "${FILTER_MODULE}" "${INPUT}" "${INPUT}" "${COMPDB}" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ || fail "FilterComplex died" fi INPUT="${TMP_PATH}/cmpl" @@ -51,7 +51,7 @@ INPUT="${TMP_PATH}/cmpl" # FIXME : clust if notExists "${TMP_PATH}/clu.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" clust "${INPUT}" "${TMP_PATH}/complex_filt" "{TMP_PATH}/$2" ${CLUSTER_PAR} \ + "$MMSEQS" clust "${INPUT}" "${TMP_PATH}/complex_filt" "$2" ${CLUSTER_PAR} \ || fail "Clustering died" fi @@ -85,21 +85,23 @@ if notExists "${TMP_PATH}/all_seqs.fasta"; then || fail "result2flat died" fi -mv "${TMP_PATH}/all_seqs.fasta" "${RESULTS}_all_seqs.fasta" -mv "${TMP_PATH}/rep_seq.fasta" "${RESULTS}_rep_seq.fasta" -mv "${TMP_PATH}/cluster.tsv" "${RESULTS}_cluster.tsv" +mv "${TMP_PATH}/all_seqs.fasta" "${RESULT}_all_seqs.fasta" +mv "${TMP_PATH}/rep_seq.fasta" "${RESULT}_rep_seq.fasta" +mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" # TODO : remove tmp -> tide up and organize if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY} + "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY} + "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/clu_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR} # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_filt" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 "$MMSEQS" rmdb "$2" ${VERBOSITY_PAR} rm -rf "${TMP_PATH}/complexsearch_tmp" rm -f "${TMP_PATH}/easycomplexcluster.sh" diff --git a/data/test_complexcluster.sh b/data/test_complexcluster.sh deleted file mode 100644 index c051fa94..00000000 --- a/data/test_complexcluster.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/sh -e -#TODO: maybe change file name into filtercomplex.sh -fail() { - echo "Error: $1" - exit 1 -} - -notExists() { - [ ! -f "$1" ] -} - -exists() { - [ -f "$1" ] -} - -abspath() { - if [ -d "$1" ]; then - (cd "$1"; pwd) - elif [ -f "$1" ]; then - if [ -z "${1##*/*}" ]; then - echo "$(cd "${1%/*}"; pwd)/${1##*/}" - else - echo "$(pwd)/$1" - fi - elif [ -d "$(dirname "$1")" ]; then - echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" - fi -} - -# Shift initial DB to complexDB using soft-linking -# $1: input db -# $2: output db -buildCmplDb() { - touch "${2}" - awk -F"\t" 'BEGIN {OFFSET=0} - FNR==NR{chain_len[$1]=$3;next} - { - if (!($3 in off_arr)) { - off_arr[$3]=OFFSET - } - cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] - } - END { - for (cmpl in off_arr) { - print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] - } - }' "${1}.index" "${1}.lookup" > "${2}.index" - ln -s "$(abspath "${1}")" "${2}.0" - cp "${1}.dbtype" "${2}.dbtype" -} - -# check number of input variables -[ "$#" -ne 3 ] && echo "Please provide " && exit 1; - -# TODO : replace TMP_PATH -FILTALN="${QUERY}_filtcomp" -# DOING : filtercomplex -if notExists "${TMP_PATH}/${FILTALN}.dbtype"; then - # shellcheck disable=SC2086 - $MMSEQS filtercomplex ${QUERY} ${TARGET} ${TMP_PATH}/${FILTALN} ${FILTERCOMPLEX_PAR} \ - || fail "FilterComplex died" -fi - -# FIXME : softlink source to complexDB -if notExists "${TMP_PATH}/cmpl_db.dbtype"; then - buildCmplDb "${SOURCE}" "${TMP_PATH}/cmpl_db" -fi - -INPUT="${TMP_PATH}/cmpl" -# FIXME : clust -if notExists "${TMP_PATH}/clu.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" clust "${INPUT}" ${TMP_PATH}/${FILTALN} "${RESULT}" ${CLUSTER_PAR} \ - || fail "Clustering died" -fi - -# TODO : remove tmp -if [ -n "${REMOVE_TMP}" ]; then - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/${FILTALN}" ${VERBOSITY_PAR} - "$MMSEQS" rmdb "${TMP_PATH}/cmpl_db" ${VERBOSITY_PAR} - rm -rf ${TMP_PATH}/complexcluster.sh diff --git a/data/test_easycomplexcluster.sh b/data/test_easycomplexcluster.sh deleted file mode 100644 index 509c0167..00000000 --- a/data/test_easycomplexcluster.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/sh -e -fail() { - echo "Error: $1" - exit 1 -} - -notExists() { - [ ! -f "$1" ] -} - -exists() { - [ -f "$1" ] -} - -# check if files exist -[ ! -f "${INPUT}.dbtype" ] && echo "${INPUT}.dbtype not found!" && exit 1; -[ ! -d "${TMP_PATH}" ] && echo "tmp directory ${TMP_PATH} not found!" && mkdir -p "${TMP_PATH}"; - -# DOING : createdb -if notExists "${INPUT}.dbtype"; then - if notExists "${TMP_PATH}/query"; then - # shellcheck disable=SC2086 - "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ - || fail "input createdb died" - fi -fi - -# DOING : complexsearch -if notExists "${TMP_PATH}/complex_result.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ - || fail "ComplexSearch died" -fi -COMPDB="${TMP_PATH}/complexsearch_tmp" - -# DOING : call complexcluster or filtercomplex+awk -# TODO : maybe save filtercomplex result file to sub-dir of TMP_PATH -if notExists "${TMP_PATH}/${RESULT}.dbtype"; then - $MMSEQS "${CLUSTER_MODULE}" "${INPUT}" "${INPUT}" "${TMP_PATH}/${RESULT}" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ - || fail "ClusterComplex died" -fi - -# DOING : make tsv file -if notExists "${TMP_PATH}/cluster.tsv"; then - # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/${RESULTS}" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ - || fail "Convert Alignments died" -fi - -# FIXME : make rep_seq.fasta, and how ? -# TODO: figure out how to represent complex sequences as a single fasta entry? -if notExists "${TMP_PATH}/rep_seq.fasta"; then - # shellcheck disable=SC2086 - "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/${RESULTS}" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ - || fail "Result2repseq died" - - # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/clu_rep" "${TMP_PATH}/rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ - || fail "result2flat died" -fi - -# FIXME : make all_seq.fasta, and how ? -if notExists "${TMP_PATH}/all_seqs.fasta"; then - # shellcheck disable=SC2086 - "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/${RESULTS}" "${TMP_PATH}/clu_seqs" ${THREADS_PAR} \ - || fail "Result2repseq died" - - # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/clu_seqs" "${TMP_PATH}/all_seqs.fasta" ${VERBOSITY_PAR} \ - || fail "result2flat died" -fi - -mv "${TMP_PATH}/all_seqs.fasta" "${RESULTS}_all_seqs.fasta" -mv "${TMP_PATH}/rep_seq.fasta" "${RESULTS}_rep_seq.fasta" -mv "${TMP_PATH}/cluster.tsv" "${RESULTS}_cluster.tsv" - -# TODO : remove tmp -> tide up and organize -if [ -n "${REMOVE_TMP}" ]; then - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/clu_seqs" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "$2" ${VERBOSITY_PAR} - rm -rf "${TMP_PATH}/complexsearch_tmp" - rm -f "${TMP_PATH}/easycomplexcluster.sh" -fi \ No newline at end of file diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index e00dbeb0..5489eb8b 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -28,8 +28,7 @@ LocalParameters::LocalParameters() : PARAM_INDEX_EXCLUDE(PARAM_INDEX_EXCLUDE_ID, "--index-exclude", "Index Exclusion", "Exclude parts of the index:\n0: Full index\n1: Exclude k-mer index (for use with --prefilter-mode 1)\n2: Exclude C-alpha coordinates (for use with --sort-by-structure-bits 0)\nFlags can be combined bit wise", typeid(int), (void *) &indexExclude, "^[0-3]{1}$", MMseqsParameter::COMMAND_EXPERT), PARAM_COMPLEX_REPORT_MODE(PARAM_COMPLEX_REPORT_MODE_ID, "--complex-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &complexReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN), - PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), - PARAM_FILTER_COVERAGE_THRESHOLD(PARAM_FILTER_COVERAGE_THRESHOLD_ID, "--filter-coverage", "Filtercomplex Coverage Threshold", "filters alignments with complex coverage > thr [0.0,1.0]",typeid(float), (void *) &filterCovThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") + PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$") { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$"; @@ -178,7 +177,7 @@ LocalParameters::LocalParameters() : //filtercomplex filtercomplex.push_back(&PARAM_V); filtercomplex.push_back(&PARAM_THREADS); - filtercomplex.push_back(&PARAM_FILTER_COVERAGE_THRESHOLD); + filtercomplex.push_back(&PARAM_C); filtercomplex.push_back(&PARAM_COV_MODE); // createcomplexreport @@ -198,13 +197,12 @@ LocalParameters::LocalParameters() : // TODO : filtercomplexworkflow filtercomplexworkflow = filtercomplex; - // filtercomplexworkflow.push_back() - // TODO: easycomplexclusterworkflow: after clust? + //easycomplexclusterworkflow easycomplexclusterworkflow = combineList(structurecreatedb, complexsearchworkflow); easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, filtercomplexworkflow); easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, clust); - + easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, convertalignments); // expandcomplex expandcomplex.push_back(&PARAM_THREADS); @@ -229,7 +227,6 @@ LocalParameters::LocalParameters() : coordStoreMode = COORD_STORE_MODE_CA_DIFF; clusterSearch = 0; inputFormat = 0; // auto detect - filterCovThr = 0.8; fileInclude = ".*"; fileExclude = "^$"; dbSuffixList = "_h,_ss,_ca"; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 5c4caa12..54a166e3 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -117,7 +117,6 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_COMPLEX_REPORT_MODE) PARAMETER(PARAM_EXPAND_COMPLEX_EVALUE) PARAMETER(PARAM_INPUT_FORMAT) - PARAMETER(PARAM_FILTER_COVERAGE_THRESHOLD) int prefMode; float tmScoreThr; diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 8fce0268..146e8743 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -9,8 +9,8 @@ #include "easycomplexcluster.sh.h" void setEasyComplexClusterDefaults(Parameters *p) { - //TODO - // p->PARAM_C = 0.8; + //TODO, parameters for search, filtercomplex, cluster, createresults + p->PARAM_C = 0.8; p->PARAM_COV_MODE = 1; p->sensitivity = 4; p->PARAM_CLUSTER_MODE = Parameters::GREEDY; @@ -21,22 +21,19 @@ void setEasyComplexClusterDefaults(Parameters *p) { } void setEasyComplexClusterMustPassAlong(Parameters *p) { - // p->PARAM_C.wasSet = true; + p->PARAM_C.wasSet = true; p->PARAM_E.wasSet = true; - p->PARAM_ALIGNMENT_MODE.wasSet = true; p->PARAM_S.wasSet = true; - par->addBacktrace = true; - par->PARAM_ADD_BACKTRACE.wasSet = true; + p->PARAM_ALIGNMENT_MODE.wasSet = true; + p->addBacktrace = true; + p->PARAM_ADD_BACKTRACE.wasSet = true; } int easycomplexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); for (size_t i = 0; i < par.createdb.size(); i++){ par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); @@ -45,21 +42,10 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - setEasyComplexSearchDefaults(&par); + setEasyComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); setEasyComplexClusterMustPassAlong(&par); - if (par.formatAlignmentMode == Parameters::FORMAT_ALIGNMENT_SAM || - par.formatAlignmentMode == LocalParameters::FORMAT_ALIGNMENT_PDB_SUPERPOSED || - par.greedyBestHits) { - needBacktrace = true; - } - if (needBacktrace) { - Debug(Debug::INFO) << "Alignment backtraces will be computed, since they were requested by output format.\n"; - par.addBacktrace = true; - par.PARAM_ADD_BACKTRACE.wasSet = true; - } - std::string tmpDir = par.filenames.back(); std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); if (par.reuseLatest) { @@ -74,10 +60,10 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { cmd.addVariable("INPUT", par.filenames.back().c_str()); par.filenames.pop_back(); - cmd.addVariable("CLUSTER_MODULE", "complexcluster"); + cmd.addVariable("CLUSTER_MODULE", "filtercomplex"); cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow).c_str()); - cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow).c_str()); + cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplexworkflow).c_str()); cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); @@ -92,5 +78,4 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { // Should never get here assert(false); return EXIT_FAILURE; - return 0; } From dbd9b076f71874141daddcb4473adf4fdfceb8ba Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 27 Feb 2024 16:44:36 +0900 Subject: [PATCH 027/160] To Complexclusterworkflow --- src/FoldseekBase.cpp | 38 ++++++++++++++++----------------- src/LocalCommandDeclarations.h | 2 +- src/commons/LocalParameters.cpp | 9 ++++---- src/commons/LocalParameters.h | 2 +- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index b64a2929..981c5a83 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -275,25 +275,25 @@ std::vector foldseekCommands = { {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb } } }, - // {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, //TODO: maybe COMMAND_MAIN? - // "Complex level cluster", - // "foldseek complexcluster queryDB result tmp\n" - // "# Cluster output\n" - // "# Important parameter: --cov-mode and -c \n" - // "# --cov-mode \n" - // "# 0 1 2\n" - // "# Q: MAVGTACRPA 60% IGN 60%\n" - // "# T: -AVGTAC--- 60% 100% IGN\n" - // "# -c 0.7 - + -\n" - // "# -c 0.6 + + +\n\n" - // "Seongeun Kim & Sooyoung Cha ", - // " ", - // CITATION_FOLDSEEK, { - // {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, - // {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, - // {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } - // } - // }, + {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, + "Complex level cluster", + "foldseek complexcluster queryDB result tmp\n" + "# Cluster output\n" + "# Important parameter: --cov-mode and -c \n" + "# --cov-mode \n" + "# 0 1 2\n" + "# Q: MAVGTACRPA 60% IGN 60%\n" + "# T: -AVGTAC--- 60% 100% IGN\n" + "# -c 0.7 - + -\n" + "# -c 0.6 + + +\n\n" + "Seongeun Kim & Sooyoung Cha ", + " ", + CITATION_FOLDSEEK, { + {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, + {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, + {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } + } + }, {"easy-complexcluster", easycomplexcluster, &localPar.easycomplexclusterworkflow, COMMAND_EASY, "Complex level cluster", "foldseek easy-complexcluster example/1tim.pdb.gz result tmp\n" diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index e9e3344a..2f6c00fd 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -23,7 +23,7 @@ extern int compressca(int argc, const char** argv, const Command &command); extern int scorecomplex(int argc, const char **argv, const Command& command); extern int filtercomplex(int argc, const char **argv, const Command& command); extern int easycomplexcluster(int argc, const char** argv, const Command &command); -// extern int complexcluster(int argc, const char** argv, const Command &command); +extern int complexcluster(int argc, const char** argv, const Command &command); extern int easycomplexsearch(int argc, const char **argv, const Command &command); extern int createcomplexreport(int argc, const char **argv, const Command &command); extern int expandcomplex(int argc, const char **argv, const Command &command); diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 5489eb8b..657fc0ad 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -195,13 +195,12 @@ LocalParameters::LocalParameters() : easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, createcomplexreport); easyscomplexsearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE); - // TODO : filtercomplexworkflow - filtercomplexworkflow = filtercomplex; + // complexclusterworkflow + complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); + complexclusterworkflow = combineList(complexclusterworkflow, clust); //easycomplexclusterworkflow - easycomplexclusterworkflow = combineList(structurecreatedb, complexsearchworkflow); - easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, filtercomplexworkflow); - easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, clust); + easycomplexclusterworkflow = combineList(structurecreatedb, complexclusterworkflow); easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, convertalignments); // expandcomplex diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 54a166e3..0e85c367 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -90,7 +90,7 @@ class LocalParameters : public Parameters { std::vector compressca; std::vector scorecomplex; std::vector filtercomplex; - std::vector filtercomplexworkflow; + std::vector complexclusterworkflow; std::vector easycomplexclusterworkflow; std::vector complexsearchworkflow; std::vector easyscomplexsearchworkflow; From 02a89148aa6dd1cf831cb17915eb3cf558a3f8ca Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 27 Feb 2024 17:50:23 +0900 Subject: [PATCH 028/160] easycc and cc .sh --- data/complexcluster.sh | 59 ++++++++++++++++++++++++++++++++++++++ data/easycomplexcluster.sh | 59 ++++++++++++-------------------------- 2 files changed, 77 insertions(+), 41 deletions(-) create mode 100644 data/complexcluster.sh diff --git a/data/complexcluster.sh b/data/complexcluster.sh new file mode 100644 index 00000000..2753496e --- /dev/null +++ b/data/complexcluster.sh @@ -0,0 +1,59 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +# check number of input variables +[ "$#" -ne 3 ] && echo "Please provide " && exit 1; +# check if files exist +[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; +[ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; +[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; + +# DOING : complexsearch +if notExists "${TMP_PATH}/complex_result.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complexsearch_aln" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ + || fail "ComplexSearch died" +fi + +# DOING : filtercomplex +if notExists "${RESULT}_filt.dbtype"; then + # shellcheck disable=SC2086 + $MMSEQS filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complexsearch_aln" "${RESULT}_filt" ${FILTERCOMPLEX_PAR} \ + || fail "FilterComplex died" +fi + +# DOING : softlink source to complexDB +if notExists "${TMP_PATH}/cmpl_db.dbtype"; then + buildCmplDb "${INPUT}" "${TMP_PATH}/cmpl_db" +fi + +INPUT2="${TMP_PATH}/cmpl_db" + +# DOING : clust +if notExists "${RESULT}.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" clust "${INPUT2}" "${RESULT}_filt" "${RESULT}" ${CLUSTER_PAR} \ + || fail "Clustering died" +fi + + +# DOING: Remove tmp +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${RESULT}_filt" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complexsearch_aln" ${VERBOSITY_PAR} + rm -rf "${TMP_PATH}/complexsearch_tmp" + rm -f "${TMP_PATH}/easycomplexcluster.sh" +fi \ No newline at end of file diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 18a899bd..37b60e1c 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -13,75 +13,53 @@ exists() { } # check number of input variables -[ "$#" -ne 3 ] && echo "Please provide " && exit 1; +[ "$#" -ne 3 ] && echo "Please provide " && exit 1; # check if files exist -[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; -[ -f "$2.dbtype" ] && echo "$2.dbtype exists already!" && exit 1; [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; -INPUT="$1" -TMP_PATH="$3" -SOURCE="$INPUT" - # DOING : createdb -if notExists "${INPUT}.dbtype"; then - if notExists "${TMP_PATH}/query"; then - # shellcheck disable=SC2086 - "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ - || fail "input createdb died" - fi -fi - -# DOING : complexsearch -if notExists "${TMP_PATH}/complex_result.dbtype"; then +if notExists "${TMP_PATH}/input.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ - || fail "ComplexSearch died" + "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ + || fail "input createdb died" + fi fi -COMPDB="${TMP_PATH}/complexsearch_tmp" +INPUT="${TMP_PATH}/input" -# DOING : call complexcluster or filtercomplex+awk -# TODO : maybe save filtercomplex result file to sub-dir of TMP_PATH +# DOING : complexcluster if notExists "${TMP_PATH}/cmpl_db.dbtype"; then - $MMSEQS "${FILTER_MODULE}" "${INPUT}" "${INPUT}" "${COMPDB}" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ - || fail "FilterComplex died" -fi -INPUT="${TMP_PATH}/cmpl" - -# FIXME : clust -if notExists "${TMP_PATH}/clu.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" clust "${INPUT}" "${TMP_PATH}/complex_filt" "$2" ${CLUSTER_PAR} \ - || fail "Clustering died" + $MMSEQS complexcluster "${INPUT}" "${RESULT}" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ + || fail "Complexcluster died" fi +INPUT2="${TMP_PATH}/cmpl" # DOING : make tsv file if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "$2" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + "$MMSEQS" createtsv "${INPUT2}" "${INPUT2}" "${RESULT}" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ || fail "Convert Alignments died" fi -# FIXME : make rep_seq.fasta, and how ? +# FIXME : make rep_seq.fasta # TODO: figure out how to represent complex sequences as a single fasta entry? if notExists "${TMP_PATH}/rep_seq.fasta"; then # shellcheck disable=SC2086 - "$MMSEQS" result2repseq "${INPUT}" "$2" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ + "$MMSEQS" result2repseq "${INPUT2}" "${RESULT}" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ || fail "Result2repseq died" # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/clu_rep" "${TMP_PATH}/rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${INPUT2}" "${INPUT2}" "${TMP_PATH}/clu_rep" "${TMP_PATH}/rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ || fail "result2flat died" fi # FIXME : make all_seq.fasta, and how ? if notExists "${TMP_PATH}/all_seqs.fasta"; then # shellcheck disable=SC2086 - "$MMSEQS" createseqfiledb "${INPUT}" "$2" "${TMP_PATH}/clu_seqs" ${THREADS_PAR} \ + "$MMSEQS" createseqfiledb "${INPUT2}" "${RESULT}" "${TMP_PATH}/clu_seqs" ${THREADS_PAR} \ || fail "Result2repseq died" # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/clu_seqs" "${TMP_PATH}/all_seqs.fasta" ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${INPUT2}" "${INPUT2}" "${TMP_PATH}/clu_seqs" "${TMP_PATH}/all_seqs.fasta" ${VERBOSITY_PAR} \ || fail "result2flat died" fi @@ -100,9 +78,8 @@ if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_filt" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${INPUT2}" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "$2" ${VERBOSITY_PAR} - rm -rf "${TMP_PATH}/complexsearch_tmp" + "$MMSEQS" rmdb "${RESULT}" ${VERBOSITY_PAR} rm -f "${TMP_PATH}/easycomplexcluster.sh" fi \ No newline at end of file From b9d7315068d7aaefea0d066f65460adbdc2586db Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 27 Feb 2024 18:24:29 +0900 Subject: [PATCH 029/160] .cpp files --- src/workflow/ComplexCluster.cpp | 80 +++++++++++++++++++++++++++++ src/workflow/EasyComplexCluster.cpp | 19 ++++--- 2 files changed, 89 insertions(+), 10 deletions(-) create mode 100644 src/workflow/ComplexCluster.cpp diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp new file mode 100644 index 00000000..5da3d5e0 --- /dev/null +++ b/src/workflow/ComplexCluster.cpp @@ -0,0 +1,80 @@ +#include + +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" +#include "LocalParameters.h" + +#include "complexcluster.sh.h" + +void setEasyComplexClusterDefaults(Parameters *p) { + //TODO, parameters for search, filtercomplex, cluster, createresults + p->PARAM_C = 0.8; + p->PARAM_COV_MODE = 1; + p->PARAM_S = 4; + p->PARAM_CLUSTER_MODE = Parameters::GREEDY; + p->PARAM_E = 0.001; + p->PARAM_ALIGNMENT_MODE = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + p->PARAM_GAP_OPEN = 10; + p->PARAM_GAP_EXTEND = 1; +} + +void setEasyComplexClusterMustPassAlong(Parameters *p) { + p->PARAM_C.wasSet = true; + p->PARAM_E.wasSet = true; + p->PARAM_S.wasSet = true; + p->PARAM_ALIGNMENT_MODE.wasSet = true; + p->PARAM_ADD_BACKTRACE = true; + p->PARAM_ADD_BACKTRACE.wasSet = true; + +} +int complexcluster(int argc, const char **argv, const Command &command) +{ + LocalParameters &par = LocalParameters::getLocalInstance(); + par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); + + for (size_t i = 0; i < par.createdb.size(); i++){ + par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } + par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + + setEasyComplexClusterDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setEasyComplexClusterMustPassAlong(&par); + + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + cmd.addVariable("TMP_PATH", tmpDir.c_str()); + cmd.addVariable("RESULT", par.filenames.back().c_str()); + par.filenames.pop_back(); + cmd.addVariable("INPUT", par.filenames.back().c_str()); + par.filenames.pop_back(); + + cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow).c_str()); + cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplexworkflow).c_str()); + cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str()); + cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); + + cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); + + std::string program = tmpDir + "/easycomplexcluster.sh"; + FileUtil::writeFile(program, easycomplexcluster_sh, easycomplexcluster_sh_len); + cmd.execProgram(program.c_str(), par.filenames); + + + // Should never get here + assert(false); + return EXIT_FAILURE; +} \ No newline at end of file diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 146e8743..aa2fcd45 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -12,12 +12,12 @@ void setEasyComplexClusterDefaults(Parameters *p) { //TODO, parameters for search, filtercomplex, cluster, createresults p->PARAM_C = 0.8; p->PARAM_COV_MODE = 1; - p->sensitivity = 4; + p->PARAM_S = 4; p->PARAM_CLUSTER_MODE = Parameters::GREEDY; - p->evalThr = 0.001; - p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; - p->gapOpen = 10; - p->gapExtend = 1; + p->PARAM_E = 0.001; + p->PARAM_ALIGNMENT_MODE = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + p->PARAM_GAP_OPEN = 10; + p->PARAM_GAP_EXTEND = 1; } void setEasyComplexClusterMustPassAlong(Parameters *p) { @@ -25,7 +25,7 @@ void setEasyComplexClusterMustPassAlong(Parameters *p) { p->PARAM_E.wasSet = true; p->PARAM_S.wasSet = true; p->PARAM_ALIGNMENT_MODE.wasSet = true; - p->addBacktrace = true; + p->PARAM_ADD_BACKTRACE = true; p->PARAM_ADD_BACKTRACE.wasSet = true; } @@ -34,7 +34,7 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); - + par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); for (size_t i = 0; i < par.createdb.size(); i++){ par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } @@ -60,10 +60,9 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { cmd.addVariable("INPUT", par.filenames.back().c_str()); par.filenames.pop_back(); - cmd.addVariable("CLUSTER_MODULE", "filtercomplex"); + cmd.addVariable("CLUSTER_MODULE", "complexcluster"); cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); - cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow).c_str()); - cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplexworkflow).c_str()); + cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow).c_str()); cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); From aebd3fd8f090acce5feb7f1d4e1bcdc9b891d45d Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 27 Feb 2024 18:52:34 +0900 Subject: [PATCH 030/160] small changes --- data/complexcluster.sh | 5 ++- data/easycomplexcluster.sh | 15 +++---- data/filtercomplex.sh | 70 ----------------------------- src/FoldseekBase.cpp | 4 +- src/commons/LocalParameters.cpp | 1 + src/workflow/ComplexCluster.cpp | 4 +- src/workflow/EasyComplexCluster.cpp | 5 +-- src/workflow/FilterComplex.cpp | 14 ------ 8 files changed, 17 insertions(+), 101 deletions(-) delete mode 100644 data/filtercomplex.sh delete mode 100644 src/workflow/FilterComplex.cpp diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 2753496e..3204a5c0 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -19,6 +19,7 @@ exists() { [ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; +INPUT="$1" # DOING : complexsearch if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 @@ -38,12 +39,12 @@ if notExists "${TMP_PATH}/cmpl_db.dbtype"; then buildCmplDb "${INPUT}" "${TMP_PATH}/cmpl_db" fi -INPUT2="${TMP_PATH}/cmpl_db" +INPUTT="${TMP_PATH}/cmpl_db" # DOING : clust if notExists "${RESULT}.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" clust "${INPUT2}" "${RESULT}_filt" "${RESULT}" ${CLUSTER_PAR} \ + "$MMSEQS" clust "${INPUTT}" "${RESULT}_filt" "${RESULT}" ${CLUSTER_PAR} \ || fail "Clustering died" fi diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 37b60e1c..92e4726f 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -22,7 +22,6 @@ if notExists "${TMP_PATH}/input.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ || fail "input createdb died" - fi fi INPUT="${TMP_PATH}/input" @@ -31,12 +30,12 @@ if notExists "${TMP_PATH}/cmpl_db.dbtype"; then $MMSEQS complexcluster "${INPUT}" "${RESULT}" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ || fail "Complexcluster died" fi -INPUT2="${TMP_PATH}/cmpl" +INPUTT="${TMP_PATH}/cmpl" # DOING : make tsv file if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT2}" "${INPUT2}" "${RESULT}" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + "$MMSEQS" createtsv "${INPUTT}" "${INPUTT}" "${RESULT}" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ || fail "Convert Alignments died" fi @@ -44,22 +43,22 @@ fi # TODO: figure out how to represent complex sequences as a single fasta entry? if notExists "${TMP_PATH}/rep_seq.fasta"; then # shellcheck disable=SC2086 - "$MMSEQS" result2repseq "${INPUT2}" "${RESULT}" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ + "$MMSEQS" result2repseq "${INPUTT}" "${RESULT}" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ || fail "Result2repseq died" # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT2}" "${INPUT2}" "${TMP_PATH}/clu_rep" "${TMP_PATH}/rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${INPUTT}" "${INPUTT}" "${TMP_PATH}/clu_rep" "${TMP_PATH}/rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ || fail "result2flat died" fi # FIXME : make all_seq.fasta, and how ? if notExists "${TMP_PATH}/all_seqs.fasta"; then # shellcheck disable=SC2086 - "$MMSEQS" createseqfiledb "${INPUT2}" "${RESULT}" "${TMP_PATH}/clu_seqs" ${THREADS_PAR} \ + "$MMSEQS" createseqfiledb "${INPUTT}" "${RESULT}" "${TMP_PATH}/clu_seqs" ${THREADS_PAR} \ || fail "Result2repseq died" # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT2}" "${INPUT2}" "${TMP_PATH}/clu_seqs" "${TMP_PATH}/all_seqs.fasta" ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${INPUTT}" "${INPUTT}" "${TMP_PATH}/clu_seqs" "${TMP_PATH}/all_seqs.fasta" ${VERBOSITY_PAR} \ || fail "result2flat died" fi @@ -78,7 +77,7 @@ if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${INPUT2}" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${INPUTT}" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${RESULT}" ${VERBOSITY_PAR} rm -f "${TMP_PATH}/easycomplexcluster.sh" diff --git a/data/filtercomplex.sh b/data/filtercomplex.sh deleted file mode 100644 index b6c425b5..00000000 --- a/data/filtercomplex.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/sh -e -#TODO: maybe change file name into filtercomplex.sh -fail() { - echo "Error: $1" - exit 1 -} - -notExists() { - [ ! -f "$1" ] -} - -exists() { - [ -f "$1" ] -} - -abspath() { - if [ -d "$1" ]; then - (cd "$1"; pwd) - elif [ -f "$1" ]; then - if [ -z "${1##*/*}" ]; then - echo "$(cd "${1%/*}"; pwd)/${1##*/}" - else - echo "$(pwd)/$1" - fi - elif [ -d "$(dirname "$1")" ]; then - echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" - fi -} - -# Shift initial DB to complexDB using soft-linking -# $1: input db -# $2: output db -buildCmplDb() { - touch "${2}" - awk -F"\t" 'BEGIN {OFFSET=0} - FNR==NR{chain_len[$1]=$3;next} - { - if (!($3 in off_arr)) { - off_arr[$3]=OFFSET - } - cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] - } - END { - for (cmpl in off_arr) { - print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] - } - }' "${1}.index" "${1}.lookup" > "${2}.index" - ln -s "$(abspath "${1}")" "${2}.0" - cp "${1}.dbtype" "${2}.dbtype" -} - -# check number of input variables -[ "$#" -ne 4 ] && echo "Please provide " && exit 1; -# check if files exist -[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; -[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1; -[ ! -f "$3.dbtype" ] && echo "$3.dbtype not found!" && exit 1; -[ -f "$4.dbtype" ] && echo "$4.dbtype exists already!" && exit 1; - -if notExists "$4"; then - # shellcheck disable=SC2086 - $MMSEQS filtercomplex "$1" "$2" "$3" "$4" ${FILTERCOMPLEX_PAR} \ - || fail "FilterComplex died" -fi - -if notExists "${CMPLDB_PATH}/cmpl_db.dbtype"; then - buildCmplDb "${SOURCE}" "${CMPLDB_PATH}/cmpl_db" -fi - -# DONE : remove tmp -> No TMP file generated \ No newline at end of file diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 981c5a83..6916eeb8 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -310,10 +310,10 @@ std::vector foldseekCommands = { "# -c 0.7 - + -\n" "# -c 0.6 + + +\n\n" "Seongeun Kim & Sooyoung Cha ", - " ... ", + " ... ", CITATION_FOLDSEEK, { {"PDB|mmCIF[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &FoldSeekDbValidator::flatfileStdinAndFolder}, - {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, + {"outputFileName", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } } }, diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 657fc0ad..b1c6348c 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -198,6 +198,7 @@ LocalParameters::LocalParameters() : // complexclusterworkflow complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); complexclusterworkflow = combineList(complexclusterworkflow, clust); + complexclusterworkflow.push_back(&PARAM_C) //easycomplexclusterworkflow easycomplexclusterworkflow = combineList(structurecreatedb, complexclusterworkflow); diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 5da3d5e0..4daa9721 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -16,8 +16,8 @@ void setEasyComplexClusterDefaults(Parameters *p) { p->PARAM_CLUSTER_MODE = Parameters::GREEDY; p->PARAM_E = 0.001; p->PARAM_ALIGNMENT_MODE = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; - p->PARAM_GAP_OPEN = 10; - p->PARAM_GAP_EXTEND = 1; + p->gapOpen = 10; + p->gapExtend = 1; } void setEasyComplexClusterMustPassAlong(Parameters *p) { diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index aa2fcd45..152e44ba 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -16,8 +16,8 @@ void setEasyComplexClusterDefaults(Parameters *p) { p->PARAM_CLUSTER_MODE = Parameters::GREEDY; p->PARAM_E = 0.001; p->PARAM_ALIGNMENT_MODE = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; - p->PARAM_GAP_OPEN = 10; - p->PARAM_GAP_EXTEND = 1; + p->gapOpen = 10; + p->gapExtend = 1; } void setEasyComplexClusterMustPassAlong(Parameters *p) { @@ -60,7 +60,6 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { cmd.addVariable("INPUT", par.filenames.back().c_str()); par.filenames.pop_back(); - cmd.addVariable("CLUSTER_MODULE", "complexcluster"); cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow).c_str()); cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); diff --git a/src/workflow/FilterComplex.cpp b/src/workflow/FilterComplex.cpp deleted file mode 100644 index d7a26a04..00000000 --- a/src/workflow/FilterComplex.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include - -#include "FileUtil.h" -#include "CommandCaller.h" -#include "Util.h" -#include "Debug.h" -#include "LocalParameters.h" - -#include "filtercluster.sh.h" - -int complexcluster(int argc, const char **argv, const Command &command) -{ - return 0; -} \ No newline at end of file From dd34d67058b6873e8cb651339c25d0447762ad7a Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 27 Feb 2024 20:08:19 +0900 Subject: [PATCH 031/160] still build failed --- data/easycomplexcluster.sh | 2 +- src/FoldseekBase.cpp | 2 +- src/commons/LocalParameters.cpp | 2 +- src/workflow/ComplexCluster.cpp | 29 +++++++++++++++-------------- src/workflow/EasyComplexCluster.cpp | 15 ++++++++------- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 92e4726f..d3af8910 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -27,7 +27,7 @@ INPUT="${TMP_PATH}/input" # DOING : complexcluster if notExists "${TMP_PATH}/cmpl_db.dbtype"; then - $MMSEQS complexcluster "${INPUT}" "${RESULT}" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ + $MMSEQS complexcluster "${INPUT}" "${RESULT}" "${TMP_PATH}" "${COMPLEXCLUSTER_PAR}" \ || fail "Complexcluster died" fi INPUTT="${TMP_PATH}/cmpl" diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 6916eeb8..6dfbe641 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -263,7 +263,7 @@ std::vector foldseekCommands = { {"complexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb} } }, - {"filtercomplex", filtercomplex, &localPar.filtercomplexworkflow, COMMAND_HIDDEN, + {"filtercomplex", filtercomplex, &localPar.filtercomplex, COMMAND_HIDDEN, "Filters complexes and generates a new complexDB", //FIX: explain about output complexDB+clustDB? "foldseek filtercomplex queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", "Seongeun Kim & Sooyoung Cha ", diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index b1c6348c..123799b7 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -198,7 +198,7 @@ LocalParameters::LocalParameters() : // complexclusterworkflow complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); complexclusterworkflow = combineList(complexclusterworkflow, clust); - complexclusterworkflow.push_back(&PARAM_C) + complexclusterworkflow.push_back(&PARAM_C); //easycomplexclusterworkflow easycomplexclusterworkflow = combineList(structurecreatedb, complexclusterworkflow); diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 4daa9721..3a5cc89e 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -8,24 +8,24 @@ #include "complexcluster.sh.h" -void setEasyComplexClusterDefaults(Parameters *p) { +void setComplexClusterDefaults(Parameters *p) { //TODO, parameters for search, filtercomplex, cluster, createresults - p->PARAM_C = 0.8; - p->PARAM_COV_MODE = 1; - p->PARAM_S = 4; - p->PARAM_CLUSTER_MODE = Parameters::GREEDY; - p->PARAM_E = 0.001; - p->PARAM_ALIGNMENT_MODE = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + p->covThr = 0.8; + p->covMode = 1; + p->sensitivity = 4; + p->clusteringMode = Parameters::GREEDY; + p->evalThr = 0.001; + p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; p->gapOpen = 10; p->gapExtend = 1; } -void setEasyComplexClusterMustPassAlong(Parameters *p) { +void setComplexClusterMustPassAlong(Parameters *p) { p->PARAM_C.wasSet = true; p->PARAM_E.wasSet = true; p->PARAM_S.wasSet = true; p->PARAM_ALIGNMENT_MODE.wasSet = true; - p->PARAM_ADD_BACKTRACE = true; + p->addBacktrace = true; p->PARAM_ADD_BACKTRACE.wasSet = true; } @@ -44,9 +44,9 @@ int complexcluster(int argc, const char **argv, const Command &command) par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - setEasyComplexClusterDefaults(&par); + setComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - setEasyComplexClusterMustPassAlong(&par); + setComplexClusterMustPassAlong(&par); std::string tmpDir = par.filenames.back(); std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); @@ -56,6 +56,7 @@ int complexcluster(int argc, const char **argv, const Command &command) tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); par.filenames.pop_back(); + CommandCaller cmd; cmd.addVariable("TMP_PATH", tmpDir.c_str()); cmd.addVariable("RESULT", par.filenames.back().c_str()); par.filenames.pop_back(); @@ -63,14 +64,14 @@ int complexcluster(int argc, const char **argv, const Command &command) par.filenames.pop_back(); cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow).c_str()); - cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplexworkflow).c_str()); + cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplex).c_str()); cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str()); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); - std::string program = tmpDir + "/easycomplexcluster.sh"; - FileUtil::writeFile(program, easycomplexcluster_sh, easycomplexcluster_sh_len); + std::string program = tmpDir + "/complexcluster.sh"; + FileUtil::writeFile(program, complexcluster_sh, complexcluster_sh_len); cmd.execProgram(program.c_str(), par.filenames); diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 152e44ba..e6728c83 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -10,12 +10,12 @@ void setEasyComplexClusterDefaults(Parameters *p) { //TODO, parameters for search, filtercomplex, cluster, createresults - p->PARAM_C = 0.8; - p->PARAM_COV_MODE = 1; - p->PARAM_S = 4; - p->PARAM_CLUSTER_MODE = Parameters::GREEDY; - p->PARAM_E = 0.001; - p->PARAM_ALIGNMENT_MODE = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + p->covThr = 0.8; + p->covMode = 1; + p->sensitivity = 4; + p->clusteringMode = Parameters::GREEDY; + p->evalThr = 0.001; + p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; p->gapOpen = 10; p->gapExtend = 1; } @@ -25,7 +25,7 @@ void setEasyComplexClusterMustPassAlong(Parameters *p) { p->PARAM_E.wasSet = true; p->PARAM_S.wasSet = true; p->PARAM_ALIGNMENT_MODE.wasSet = true; - p->PARAM_ADD_BACKTRACE = true; + p->addBacktrace = true; p->PARAM_ADD_BACKTRACE.wasSet = true; } @@ -54,6 +54,7 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); par.filenames.pop_back(); + CommandCaller cmd; cmd.addVariable("TMP_PATH", tmpDir.c_str()); cmd.addVariable("RESULT", par.filenames.back().c_str()); par.filenames.pop_back(); From 6db40b57ffa9bc84bf877817177d9f6fdebe5175 Mon Sep 17 00:00:00 2001 From: rachelse Date: Wed, 28 Feb 2024 09:21:14 +0900 Subject: [PATCH 032/160] tmp LocalParameters.cpp --- src/commons/LocalParameters.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index b1c6348c..9577aa40 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -198,11 +198,10 @@ LocalParameters::LocalParameters() : // complexclusterworkflow complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); complexclusterworkflow = combineList(complexclusterworkflow, clust); - complexclusterworkflow.push_back(&PARAM_C) //easycomplexclusterworkflow easycomplexclusterworkflow = combineList(structurecreatedb, complexclusterworkflow); - easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, convertalignments); + easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, result2repseq); // expandcomplex expandcomplex.push_back(&PARAM_THREADS); From 8dd17b2440cfd3636dcd6b07ac35281ecd11d424 Mon Sep 17 00:00:00 2001 From: rachelse Date: Wed, 28 Feb 2024 13:45:08 +0900 Subject: [PATCH 033/160] Organized shell scripts --- data/complexcluster.sh | 68 +++++++++++++++++++++++---------- data/easycomplexcluster.sh | 45 ++++++++++------------ src/FoldseekBase.cpp | 15 ++++---- src/commons/LocalParameters.cpp | 4 -- 4 files changed, 75 insertions(+), 57 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 3204a5c0..11a849d0 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -12,6 +12,42 @@ exists() { [ -f "$1" ] } +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +# Shift initial DB to complexDB using soft-linking +# $1: input db +# $2: output db +buildCmplDb() { + touch "${2}" + awk -F"\t" 'BEGIN {OFFSET=0} + FNR==NR{chain_len[$1]=$3;next} + { + if (!($3 in off_arr)) { + off_arr[$3]=OFFSET + } + cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] + } + END { + for (cmpl in off_arr) { + print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] + } + }' "${1}.index" "${1}.lookup" > "${2}.index" + ln -s "$(abspath "${1}")" "${2}.0" + cp "${1}.dbtype" "${2}.dbtype" +} + # check number of input variables [ "$#" -ne 3 ] && echo "Please provide " && exit 1; # check if files exist @@ -19,42 +55,34 @@ exists() { [ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; -INPUT="$1" -# DOING : complexsearch if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complexsearch_aln" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ + "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ || fail "ComplexSearch died" fi -# DOING : filtercomplex -if notExists "${RESULT}_filt.dbtype"; then +if notExists "complex_db.dbtype"; then # shellcheck disable=SC2086 - $MMSEQS filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complexsearch_aln" "${RESULT}_filt" ${FILTERCOMPLEX_PAR} \ + $MMSEQS filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ || fail "FilterComplex died" + + # build complex db as output + buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" fi +SOURCE=$INPUT +INPUT="${TMP_PATH}/complex_db" -# DOING : softlink source to complexDB -if notExists "${TMP_PATH}/cmpl_db.dbtype"; then - buildCmplDb "${INPUT}" "${TMP_PATH}/cmpl_db" -fi - -INPUTT="${TMP_PATH}/cmpl_db" - -# DOING : clust if notExists "${RESULT}.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" clust "${INPUTT}" "${RESULT}_filt" "${RESULT}" ${CLUSTER_PAR} \ + "$MMSEQS" clust "${INPUT}" "${TMP_PATH}/complex_filt" "${RESULT}" ${CLUSTER_PAR} \ || fail "Clustering died" fi - -# DOING: Remove tmp if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${RESULT}_filt" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/complex_filt" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complexsearch_aln" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} rm -rf "${TMP_PATH}/complexsearch_tmp" - rm -f "${TMP_PATH}/easycomplexcluster.sh" + rm -f "${TMP_PATH}/complexcluster.sh" fi \ No newline at end of file diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index d3af8910..5f0d1665 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -14,71 +14,66 @@ exists() { # check number of input variables [ "$#" -ne 3 ] && echo "Please provide " && exit 1; -# check if files exist -[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; +# REVIEW: TMP_DIR was already made by easycomplexcluster.cpp # check if files exist +# [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; -# DOING : createdb if notExists "${TMP_PATH}/input.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ || fail "input createdb died" fi -INPUT="${TMP_PATH}/input" -# DOING : complexcluster -if notExists "${TMP_PATH}/cmpl_db.dbtype"; then - $MMSEQS complexcluster "${INPUT}" "${RESULT}" "${TMP_PATH}" "${COMPLEXCLUSTER_PAR}" \ +if notExists "${TMP_PATH}/complex_clust.dbtype"; then + $MMSEQS complexcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clust" "${TMP_PATH}" "${COMPLEXCLUSTER_PAR}" \ || fail "Complexcluster died" fi -INPUTT="${TMP_PATH}/cmpl" -# DOING : make tsv file +# TODO: copmlex_db need header/lookup? +SOURCE=$INPUT +INPUT="${TMP_PATH}/complex_db" if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUTT}" "${INPUTT}" "${RESULT}" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ || fail "Convert Alignments died" fi -# FIXME : make rep_seq.fasta # TODO: figure out how to represent complex sequences as a single fasta entry? -if notExists "${TMP_PATH}/rep_seq.fasta"; then +if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then # shellcheck disable=SC2086 - "$MMSEQS" result2repseq "${INPUTT}" "${RESULT}" "${TMP_PATH}/clu_rep" ${RESULT2REPSEQ_PAR} \ + "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ || fail "Result2repseq died" # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUTT}" "${INPUTT}" "${TMP_PATH}/clu_rep" "${TMP_PATH}/rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ || fail "result2flat died" fi -# FIXME : make all_seq.fasta, and how ? -if notExists "${TMP_PATH}/all_seqs.fasta"; then +if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then # shellcheck disable=SC2086 - "$MMSEQS" createseqfiledb "${INPUTT}" "${RESULT}" "${TMP_PATH}/clu_seqs" ${THREADS_PAR} \ + "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_seqs" ${THREADS_PAR} \ || fail "Result2repseq died" # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUTT}" "${INPUTT}" "${TMP_PATH}/clu_seqs" "${TMP_PATH}/all_seqs.fasta" ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_seqs" "${TMP_PATH}/complex_all_seqs.fasta" ${VERBOSITY_PAR} \ || fail "result2flat died" fi -mv "${TMP_PATH}/all_seqs.fasta" "${RESULT}_all_seqs.fasta" -mv "${TMP_PATH}/rep_seq.fasta" "${RESULT}_rep_seq.fasta" +mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" +mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" -# TODO : remove tmp -> tide up and organize if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/clu_seqs" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/complex_db" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${INPUTT}" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_rep" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${RESULT}" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} rm -f "${TMP_PATH}/easycomplexcluster.sh" fi \ No newline at end of file diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 6dfbe641..8fb7a769 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -264,7 +264,7 @@ std::vector foldseekCommands = { } }, {"filtercomplex", filtercomplex, &localPar.filtercomplex, COMMAND_HIDDEN, - "Filters complexes and generates a new complexDB", //FIX: explain about output complexDB+clustDB? + "Filters complexes satisfying given coverage", "foldseek filtercomplex queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", "Seongeun Kim & Sooyoung Cha ", " ", @@ -277,7 +277,7 @@ std::vector foldseekCommands = { }, {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, "Complex level cluster", - "foldseek complexcluster queryDB result tmp\n" + "foldseek complexcluster queryDB result tmp\n" "# Cluster output\n" "# Important parameter: --cov-mode and -c \n" "# --cov-mode \n" @@ -287,18 +287,17 @@ std::vector foldseekCommands = { "# -c 0.7 - + -\n" "# -c 0.6 + + +\n\n" "Seongeun Kim & Sooyoung Cha ", - " ", + " ", CITATION_FOLDSEEK, { - {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, + {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}, {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } } }, {"easy-complexcluster", easycomplexcluster, &localPar.easycomplexclusterworkflow, COMMAND_EASY, "Complex level cluster", - "foldseek easy-complexcluster example/1tim.pdb.gz result tmp\n" + "foldseek easy-complexcluster examples/ result tmp\n" "# Cluster output\n" - "FIX ME" "# - result_rep_seq.fasta: Representatives\n" "# - result_all_seq.fasta: FASTA-like per cluster\n" "# - result_cluster.tsv: Adjacency list\n\n" @@ -310,10 +309,10 @@ std::vector foldseekCommands = { "# -c 0.7 - + -\n" "# -c 0.6 + + +\n\n" "Seongeun Kim & Sooyoung Cha ", - " ... ", + " ... ", CITATION_FOLDSEEK, { {"PDB|mmCIF[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &FoldSeekDbValidator::flatfileStdinAndFolder}, - {"outputFileName", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}, + {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } } }, diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 65196d64..9577aa40 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -198,10 +198,6 @@ LocalParameters::LocalParameters() : // complexclusterworkflow complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); complexclusterworkflow = combineList(complexclusterworkflow, clust); -<<<<<<< HEAD -======= - complexclusterworkflow.push_back(&PARAM_C); ->>>>>>> dd34d67058b6873e8cb651339c25d0447762ad7a //easycomplexclusterworkflow easycomplexclusterworkflow = combineList(structurecreatedb, complexclusterworkflow); From c3a7c959fea8d681ef5cca1bedba9575ec60759a Mon Sep 17 00:00:00 2001 From: rachelse Date: Wed, 28 Feb 2024 16:03:40 +0900 Subject: [PATCH 034/160] [TODO] Solve conflicts during make --- data/complexcluster.sh | 17 +++--- data/easycomplexcluster.sh | 80 ++++++++++++++--------------- src/workflow/ComplexCluster.cpp | 61 ++++++++++++---------- src/workflow/EasyComplexCluster.cpp | 40 ++++++--------- 4 files changed, 101 insertions(+), 97 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 11a849d0..7e638393 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -49,12 +49,17 @@ buildCmplDb() { } # check number of input variables -[ "$#" -ne 3 ] && echo "Please provide " && exit 1; +# [ "$#" -ne 3 ] && echo "Please provide " && exit 1; # check if files exist -[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; -[ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; +# [ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; +# [ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; +INPUT=$1 +RESULT=$2 +TMP_PATH=$3 +SOURCE=$INPUT + if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ @@ -69,12 +74,12 @@ if notExists "complex_db.dbtype"; then # build complex db as output buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" fi -SOURCE=$INPUT -INPUT="${TMP_PATH}/complex_db" + +INPUT2="${TMP_PATH}/complex_db" if notExists "${RESULT}.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" clust "${INPUT}" "${TMP_PATH}/complex_filt" "${RESULT}" ${CLUSTER_PAR} \ + "$MMSEQS" clust "${INPUT2}" "${TMP_PATH}/complex_filt" "${RESULT}" ${CLUSTER_PAR} \ || fail "Clustering died" fi diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 5f0d1665..83d7313f 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -13,7 +13,7 @@ exists() { } # check number of input variables -[ "$#" -ne 3 ] && echo "Please provide " && exit 1; +# [ "$#" -ne 3 ] && echo "Please provide " && exit 1; # REVIEW: TMP_DIR was already made by easycomplexcluster.cpp # check if files exist # [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; @@ -31,49 +31,49 @@ fi # TODO: copmlex_db need header/lookup? SOURCE=$INPUT INPUT="${TMP_PATH}/complex_db" -if notExists "${TMP_PATH}/cluster.tsv"; then - # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ - || fail "Convert Alignments died" -fi +# if notExists "${TMP_PATH}/cluster.tsv"; then +# # shellcheck disable=SC2086 +# "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ +# || fail "Convert Alignments died" +# fi # TODO: figure out how to represent complex sequences as a single fasta entry? -if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then - # shellcheck disable=SC2086 - "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ - || fail "Result2repseq died" +# if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then +# # shellcheck disable=SC2086 +# "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ +# || fail "Result2repseq died" - # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ - || fail "result2flat died" -fi +# # shellcheck disable=SC2086 +# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ +# || fail "result2flat died" +# fi -if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then - # shellcheck disable=SC2086 - "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_seqs" ${THREADS_PAR} \ - || fail "Result2repseq died" +# if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then +# # shellcheck disable=SC2086 +# "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_seqs" ${THREADS_PAR} \ +# || fail "Result2repseq died" - # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_seqs" "${TMP_PATH}/complex_all_seqs.fasta" ${VERBOSITY_PAR} \ - || fail "result2flat died" -fi +# # shellcheck disable=SC2086 +# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_seqs" "${TMP_PATH}/complex_all_seqs.fasta" ${VERBOSITY_PAR} \ +# || fail "result2flat died" +# fi -mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" -mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" -mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" +# mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" +# mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" +# mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" -if [ -n "${REMOVE_TMP}" ]; then - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_db" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_rep" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} - rm -f "${TMP_PATH}/easycomplexcluster.sh" -fi \ No newline at end of file +# if [ -n "${REMOVE_TMP}" ]; then +# # shellcheck disable=SC2086 +# "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} +# # shellcheck disable=SC2086 +# "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} +# # shellcheck disable=SC2086 +# "$MMSEQS" rmdb "${TMP_PATH}/complex_db" ${VERBOSITY_PAR} +# # shellcheck disable=SC2086 +# "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} +# # shellcheck disable=SC2086 +# "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_rep" ${VERBOSITY_PAR} +# # shellcheck disable=SC2086 +# "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} +# rm -f "${TMP_PATH}/easycomplexcluster.sh" +# fi \ No newline at end of file diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 3a5cc89e..533e5d3f 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -12,12 +12,13 @@ void setComplexClusterDefaults(Parameters *p) { //TODO, parameters for search, filtercomplex, cluster, createresults p->covThr = 0.8; p->covMode = 1; - p->sensitivity = 4; p->clusteringMode = Parameters::GREEDY; - p->evalThr = 0.001; - p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; - p->gapOpen = 10; - p->gapExtend = 1; + + // p->sensitivity = 4; + // p->evalThr = 0.001; + // p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + // p->gapOpen = 10; + // p->gapExtend = 1; } void setComplexClusterMustPassAlong(Parameters *p) { @@ -29,52 +30,56 @@ void setComplexClusterMustPassAlong(Parameters *p) { p->PARAM_ADD_BACKTRACE.wasSet = true; } -int complexcluster(int argc, const char **argv, const Command &command) -{ +int complexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); - par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); + # TODO : figure out if commented params needed + // par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - for (size_t i = 0; i < par.createdb.size(); i++){ - par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } - par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + par.printParameters(command.cmd, argc, argv, *command.params); + //TODO setComplexClusterDefaults(&par); - par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - setComplexClusterMustPassAlong(&par); + // setComplexClusterMustPassAlong(&par); std::string tmpDir = par.filenames.back(); - std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, par.complexclusterworkflow)); if (par.reuseLatest) { hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); } tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); par.filenames.pop_back(); + par.filenames.push_back(tmpDir); CommandCaller cmd; - cmd.addVariable("TMP_PATH", tmpDir.c_str()); - cmd.addVariable("RESULT", par.filenames.back().c_str()); - par.filenames.pop_back(); - cmd.addVariable("INPUT", par.filenames.back().c_str()); - par.filenames.pop_back(); + + // REVIEW : the variables below are declared in the complexcluster.sh + // cmd.addVariable("TMP_PATH", tmpDir.c_str()); + // cmd.addVariable("RESULT", par.filenames.back().c_str()); + // par.filenames.pop_back(); + // cmd.addVariable("INPUT", par.filenames.back().c_str()); + // par.filenames.pop_back(); cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow).c_str()); cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplex).c_str()); cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str()); - cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); - cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); + cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); std::string program = tmpDir + "/complexcluster.sh"; FileUtil::writeFile(program, complexcluster_sh, complexcluster_sh_len); cmd.execProgram(program.c_str(), par.filenames); - // Should never get here assert(false); return EXIT_FAILURE; diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index e6728c83..aa5e5d13 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -8,33 +8,28 @@ #include "easycomplexcluster.sh.h" +// REVIEW: Redundant code with src/workflow/ComplexCluster.cpp void setEasyComplexClusterDefaults(Parameters *p) { - //TODO, parameters for search, filtercomplex, cluster, createresults - p->covThr = 0.8; - p->covMode = 1; - p->sensitivity = 4; - p->clusteringMode = Parameters::GREEDY; - p->evalThr = 0.001; - p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; - p->gapOpen = 10; - p->gapExtend = 1; + p->removeTmpFiles = true; } void setEasyComplexClusterMustPassAlong(Parameters *p) { - p->PARAM_C.wasSet = true; - p->PARAM_E.wasSet = true; - p->PARAM_S.wasSet = true; - p->PARAM_ALIGNMENT_MODE.wasSet = true; - p->addBacktrace = true; - p->PARAM_ADD_BACKTRACE.wasSet = true; - + p->PARAM_REMOVE_TMP_FILES.wasSet = true; } + int easycomplexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); - par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); + // TODO : figure out if commented params needed + // par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_ALT_ALIGNMENT.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT); for (size_t i = 0; i < par.createdb.size(); i++){ par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } @@ -61,19 +56,18 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { cmd.addVariable("INPUT", par.filenames.back().c_str()); par.filenames.pop_back(); + cmd.addVariable("RUNNER", par.runner.c_str()); cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); - cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow).c_str()); + cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow, true).c_str()); cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); - cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); std::string program = tmpDir + "/easycomplexcluster.sh"; FileUtil::writeFile(program, easycomplexcluster_sh, easycomplexcluster_sh_len); cmd.execProgram(program.c_str(), par.filenames); - // Should never get here assert(false); return EXIT_FAILURE; From 6ac16224daa0a4262433557c720011f523ce8acb Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 28 Feb 2024 18:03:18 +0900 Subject: [PATCH 035/160] finally make works --- data/complexcluster.sh | 2 +- data/easycomplexcluster.sh | 82 ++++++++++++++--------------- src/FoldseekBase.cpp | 23 +------- src/workflow/ComplexCluster.cpp | 35 ++++++------ src/workflow/EasyComplexCluster.cpp | 15 +----- 5 files changed, 60 insertions(+), 97 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 7e638393..adfa4d35 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -58,7 +58,7 @@ buildCmplDb() { INPUT=$1 RESULT=$2 TMP_PATH=$3 -SOURCE=$INPUT +# SOURCE=$INPUT if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 83d7313f..6e08d89b 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -29,51 +29,51 @@ if notExists "${TMP_PATH}/complex_clust.dbtype"; then fi # TODO: copmlex_db need header/lookup? -SOURCE=$INPUT +# SOURCE=$INPUT INPUT="${TMP_PATH}/complex_db" -# if notExists "${TMP_PATH}/cluster.tsv"; then -# # shellcheck disable=SC2086 -# "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ -# || fail "Convert Alignments died" -# fi +if notExists "${TMP_PATH}/cluster.tsv"; then + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + || fail "Convert Alignments died" +fi -# TODO: figure out how to represent complex sequences as a single fasta entry? -# if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then -# # shellcheck disable=SC2086 -# "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ -# || fail "Result2repseq died" +TODO: figure out how to represent complex sequences as a single fasta entry? +if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then + # shellcheck disable=SC2086 + "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ + || fail "Result2repseq died" -# # shellcheck disable=SC2086 -# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ -# || fail "result2flat died" -# fi + # shellcheck disable=SC2086 + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ + || fail "result2flat died" +fi -# if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then -# # shellcheck disable=SC2086 -# "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_seqs" ${THREADS_PAR} \ -# || fail "Result2repseq died" +if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then + # shellcheck disable=SC2086 + "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_seqs" ${THREADS_PAR} \ + || fail "Result2repseq died" -# # shellcheck disable=SC2086 -# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_seqs" "${TMP_PATH}/complex_all_seqs.fasta" ${VERBOSITY_PAR} \ -# || fail "result2flat died" -# fi + # shellcheck disable=SC2086 + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_seqs" "${TMP_PATH}/complex_all_seqs.fasta" ${VERBOSITY_PAR} \ + || fail "result2flat died" +fi -# mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" -# mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" -# mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" +mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" +mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" +mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" -# if [ -n "${REMOVE_TMP}" ]; then -# # shellcheck disable=SC2086 -# "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} -# # shellcheck disable=SC2086 -# "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} -# # shellcheck disable=SC2086 -# "$MMSEQS" rmdb "${TMP_PATH}/complex_db" ${VERBOSITY_PAR} -# # shellcheck disable=SC2086 -# "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} -# # shellcheck disable=SC2086 -# "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_rep" ${VERBOSITY_PAR} -# # shellcheck disable=SC2086 -# "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} -# rm -f "${TMP_PATH}/easycomplexcluster.sh" -# fi \ No newline at end of file +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_db" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_rep" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} + rm -f "${TMP_PATH}/easycomplexcluster.sh" +fi \ No newline at end of file diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 8fb7a769..051b7b33 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -277,15 +277,7 @@ std::vector foldseekCommands = { }, {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, "Complex level cluster", - "foldseek complexcluster queryDB result tmp\n" - "# Cluster output\n" - "# Important parameter: --cov-mode and -c \n" - "# --cov-mode \n" - "# 0 1 2\n" - "# Q: MAVGTACRPA 60% IGN 60%\n" - "# T: -AVGTAC--- 60% 100% IGN\n" - "# -c 0.7 - + -\n" - "# -c 0.6 + + +\n\n" + "foldseek complexcluster queryDB result tmp\n", "Seongeun Kim & Sooyoung Cha ", " ", CITATION_FOLDSEEK, { @@ -296,18 +288,7 @@ std::vector foldseekCommands = { }, {"easy-complexcluster", easycomplexcluster, &localPar.easycomplexclusterworkflow, COMMAND_EASY, "Complex level cluster", - "foldseek easy-complexcluster examples/ result tmp\n" - "# Cluster output\n" - "# - result_rep_seq.fasta: Representatives\n" - "# - result_all_seq.fasta: FASTA-like per cluster\n" - "# - result_cluster.tsv: Adjacency list\n\n" - "# Important parameter: --cov-mode and -c \n" - "# --cov-mode \n" - "# 0 1 2\n" - "# Q: MAVGTACRPA 60% IGN 60%\n" - "# T: -AVGTAC--- 60% 100% IGN\n" - "# -c 0.7 - + -\n" - "# -c 0.6 + + +\n\n" + "foldseek easy-complexcluster examples/ result tmp\n", "Seongeun Kim & Sooyoung Cha ", " ... ", CITATION_FOLDSEEK, { diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 533e5d3f..50b877a9 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -13,6 +13,7 @@ void setComplexClusterDefaults(Parameters *p) { p->covThr = 0.8; p->covMode = 1; p->clusteringMode = Parameters::GREEDY; + p->addBacktrace = true; // p->sensitivity = 4; // p->evalThr = 0.001; @@ -23,16 +24,15 @@ void setComplexClusterDefaults(Parameters *p) { void setComplexClusterMustPassAlong(Parameters *p) { p->PARAM_C.wasSet = true; - p->PARAM_E.wasSet = true; - p->PARAM_S.wasSet = true; - p->PARAM_ALIGNMENT_MODE.wasSet = true; - p->addBacktrace = true; p->PARAM_ADD_BACKTRACE.wasSet = true; + // p->PARAM_E.wasSet = true; + // p->PARAM_S.wasSet = true; + // p->PARAM_ALIGNMENT_MODE.wasSet = true; } int complexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); - # TODO : figure out if commented params needed + // TODO : figure out if commented params needed // par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); @@ -45,32 +45,27 @@ int complexcluster(int argc, const char **argv, const Command &command) { // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - par.printParameters(command.cmd, argc, argv, *command.params); - - //TODO +\ setComplexClusterDefaults(&par); - // setComplexClusterMustPassAlong(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setComplexClusterMustPassAlong(&par); std::string tmpDir = par.filenames.back(); - std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, par.complexclusterworkflow)); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); if (par.reuseLatest) { hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); } tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); par.filenames.pop_back(); - par.filenames.push_back(tmpDir); CommandCaller cmd; + cmd.addVariable("TMP_PATH", tmpDir.c_str()); + cmd.addVariable("RESULT", par.filenames.back().c_str()); + par.filenames.pop_back(); + cmd.addVariable("INPUT", par.filenames.back().c_str()); + par.filenames.pop_back(); - // REVIEW : the variables below are declared in the complexcluster.sh - // cmd.addVariable("TMP_PATH", tmpDir.c_str()); - // cmd.addVariable("RESULT", par.filenames.back().c_str()); - // par.filenames.pop_back(); - // cmd.addVariable("INPUT", par.filenames.back().c_str()); - // par.filenames.pop_back(); - - cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow).c_str()); + cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow, true).c_str()); cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplex).c_str()); cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index aa5e5d13..1d44dd4b 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -19,20 +19,7 @@ void setEasyComplexClusterMustPassAlong(Parameters *p) { int easycomplexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); - // TODO : figure out if commented params needed - // par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_ALT_ALIGNMENT.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT); - for (size_t i = 0; i < par.createdb.size(); i++){ - par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } + par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); From 7e054b6f04604019090f8351e34a1f800a3ea9e0 Mon Sep 17 00:00:00 2001 From: rachelse Date: Wed, 28 Feb 2024 18:31:17 +0900 Subject: [PATCH 036/160] [DONE] Build successed. [TODO] Default Parameter setting --- data/complexcluster.sh | 8 ++++---- data/easycomplexcluster.sh | 2 +- src/workflow/ComplexCluster.cpp | 6 ------ src/workflow/EasyComplexCluster.cpp | 5 +++-- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index adfa4d35..cd91eba3 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -49,10 +49,10 @@ buildCmplDb() { } # check number of input variables -# [ "$#" -ne 3 ] && echo "Please provide " && exit 1; -# check if files exist -# [ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; -# [ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; +[ "$#" -ne 3 ] && echo "Please provide " && exit 1; +check if files exist +[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; +[ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; INPUT=$1 diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 6e08d89b..f19c9d9f 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -37,7 +37,7 @@ if notExists "${TMP_PATH}/cluster.tsv"; then || fail "Convert Alignments died" fi -TODO: figure out how to represent complex sequences as a single fasta entry? +#TODO: figure out how to represent complex sequences as a single fasta entry? if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then # shellcheck disable=SC2086 "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 50b877a9..e7675a0a 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -45,7 +45,6 @@ int complexcluster(int argc, const char **argv, const Command &command) { // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); -\ setComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); setComplexClusterMustPassAlong(&par); @@ -59,11 +58,6 @@ int complexcluster(int argc, const char **argv, const Command &command) { par.filenames.pop_back(); CommandCaller cmd; - cmd.addVariable("TMP_PATH", tmpDir.c_str()); - cmd.addVariable("RESULT", par.filenames.back().c_str()); - par.filenames.pop_back(); - cmd.addVariable("INPUT", par.filenames.back().c_str()); - par.filenames.pop_back(); cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow, true).c_str()); cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplex).c_str()); diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 1d44dd4b..7113ac86 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -8,7 +8,6 @@ #include "easycomplexcluster.sh.h" -// REVIEW: Redundant code with src/workflow/ComplexCluster.cpp void setEasyComplexClusterDefaults(Parameters *p) { p->removeTmpFiles = true; } @@ -19,7 +18,9 @@ void setEasyComplexClusterMustPassAlong(Parameters *p) { int easycomplexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); - + for (size_t i = 0; i < par.createdb.size(); i++){ + par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); From 9b18390567392b43e0db3225159430c2182b5160 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 28 Feb 2024 18:54:42 +0900 Subject: [PATCH 037/160] share status --- data/testcc.sh | 93 +++++++++++++++++++++++++++++++++++++++++ src/workflow/testCC.cpp | 82 ++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 data/testcc.sh create mode 100644 src/workflow/testCC.cpp diff --git a/data/testcc.sh b/data/testcc.sh new file mode 100644 index 00000000..7494035a --- /dev/null +++ b/data/testcc.sh @@ -0,0 +1,93 @@ +#!/bin/sh -e +fail() { + echo "Error: $1" + exit 1 +} + +notExists() { + [ ! -f "$1" ] +} + +exists() { + [ -f "$1" ] +} + +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +# Shift initial DB to complexDB using soft-linking +# $1: input db +# $2: output db +buildCmplDb() { + touch "${2}" + awk -F"\t" 'BEGIN {OFFSET=0} + FNR==NR{chain_len[$1]=$3;next} + { + if (!($3 in off_arr)) { + off_arr[$3]=OFFSET + } + cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] + } + END { + for (cmpl in off_arr) { + print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] + } + }' "${1}.index" "${1}.lookup" > "${2}.index" + ln -s "$(abspath "${1}")" "${2}.0" + cp "${1}.dbtype" "${2}.dbtype" +} + +# check number of input variables +# [ "$#" -ne 3 ] && echo "Please provide " && exit 1; +# check if files exist +# [ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; +# [ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; + +# INPUT=$1 +# RESULT=$2 +# TMP_PATH=$3 + +[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; +# SOURCE=$INPUT +if notExists "${TMP_PATH}/complex_result.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ + || fail "ComplexSearch died" +fi + +if notExists "complex_db.dbtype"; then + # shellcheck disable=SC2086 + $MMSEQS filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ + || fail "FilterComplex died" + + # build complex db as output + buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" +fi + +COMP="${TMP_PATH}/complex_db" + +if notExists "${RESULT}.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" clust "${COMP}" "${TMP_PATH}/complex_filt" "${RESULT}" ${CLUSTER_PAR} \ + || fail "Clustering died" +fi + +if [ -n "${REMOVE_TMP}" ]; then + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_filt" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} + rm -rf "${TMP_PATH}/complexsearch_tmp" + rm -f "${TMP_PATH}/complexcluster.sh" +fi \ No newline at end of file diff --git a/src/workflow/testCC.cpp b/src/workflow/testCC.cpp new file mode 100644 index 00000000..03dc95cc --- /dev/null +++ b/src/workflow/testCC.cpp @@ -0,0 +1,82 @@ +#include + +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" +#include "LocalParameters.h" + +#include "complexcluster.sh.h" + +void setComplexClusterDefaults(Parameters *p) { + //TODO, parameters for search, filtercomplex, cluster, createresults + p->covThr = 0.8; + p->covMode = 1; + p->clusteringMode = Parameters::GREEDY; + p->addBacktrace = true; + + // p->sensitivity = 4; + // p->evalThr = 0.001; + // p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; + // p->gapOpen = 10; + // p->gapExtend = 1; +} + +void setComplexClusterMustPassAlong(Parameters *p) { + p->PARAM_C.wasSet = true; + p->PARAM_ADD_BACKTRACE.wasSet = true; + // p->PARAM_E.wasSet = true; + // p->PARAM_S.wasSet = true; + // p->PARAM_ALIGNMENT_MODE.wasSet = true; + +} +int complexcluster(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + // TODO : figure out if commented params needed + // par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + +\ + setComplexClusterDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setComplexClusterMustPassAlong(&par); + + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + CommandCaller cmd; + std::cout< Date: Wed, 28 Feb 2024 19:02:17 +0900 Subject: [PATCH 038/160] Success command run --- data/complexcluster.sh | 16 ++---- data/testcc.sh | 93 --------------------------------- src/workflow/ComplexCluster.cpp | 7 +++ src/workflow/testCC.cpp | 82 ----------------------------- 4 files changed, 10 insertions(+), 188 deletions(-) delete mode 100644 data/testcc.sh delete mode 100644 src/workflow/testCC.cpp diff --git a/data/complexcluster.sh b/data/complexcluster.sh index cd91eba3..e016101d 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -48,17 +48,7 @@ buildCmplDb() { cp "${1}.dbtype" "${2}.dbtype" } -# check number of input variables -[ "$#" -ne 3 ] && echo "Please provide " && exit 1; -check if files exist -[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; -[ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; -[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; - -INPUT=$1 -RESULT=$2 -TMP_PATH=$3 -# SOURCE=$INPUT +[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 @@ -75,11 +65,11 @@ if notExists "complex_db.dbtype"; then buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" fi -INPUT2="${TMP_PATH}/complex_db" +COMP="${TMP_PATH}/complex_db" if notExists "${RESULT}.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" clust "${INPUT2}" "${TMP_PATH}/complex_filt" "${RESULT}" ${CLUSTER_PAR} \ + "$MMSEQS" clust "${COMP}" "${TMP_PATH}/complex_filt" "${RESULT}" ${CLUSTER_PAR} \ || fail "Clustering died" fi diff --git a/data/testcc.sh b/data/testcc.sh deleted file mode 100644 index 7494035a..00000000 --- a/data/testcc.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/sh -e -fail() { - echo "Error: $1" - exit 1 -} - -notExists() { - [ ! -f "$1" ] -} - -exists() { - [ -f "$1" ] -} - -abspath() { - if [ -d "$1" ]; then - (cd "$1"; pwd) - elif [ -f "$1" ]; then - if [ -z "${1##*/*}" ]; then - echo "$(cd "${1%/*}"; pwd)/${1##*/}" - else - echo "$(pwd)/$1" - fi - elif [ -d "$(dirname "$1")" ]; then - echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" - fi -} - -# Shift initial DB to complexDB using soft-linking -# $1: input db -# $2: output db -buildCmplDb() { - touch "${2}" - awk -F"\t" 'BEGIN {OFFSET=0} - FNR==NR{chain_len[$1]=$3;next} - { - if (!($3 in off_arr)) { - off_arr[$3]=OFFSET - } - cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1] - } - END { - for (cmpl in off_arr) { - print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] - } - }' "${1}.index" "${1}.lookup" > "${2}.index" - ln -s "$(abspath "${1}")" "${2}.0" - cp "${1}.dbtype" "${2}.dbtype" -} - -# check number of input variables -# [ "$#" -ne 3 ] && echo "Please provide " && exit 1; -# check if files exist -# [ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1; -# [ -f "$2.dbtype" ] && echo "$2.dbtype already exists!" && exit 1; - -# INPUT=$1 -# RESULT=$2 -# TMP_PATH=$3 - -[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; -# SOURCE=$INPUT -if notExists "${TMP_PATH}/complex_result.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ - || fail "ComplexSearch died" -fi - -if notExists "complex_db.dbtype"; then - # shellcheck disable=SC2086 - $MMSEQS filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ - || fail "FilterComplex died" - - # build complex db as output - buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" -fi - -COMP="${TMP_PATH}/complex_db" - -if notExists "${RESULT}.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" clust "${COMP}" "${TMP_PATH}/complex_filt" "${RESULT}" ${CLUSTER_PAR} \ - || fail "Clustering died" -fi - -if [ -n "${REMOVE_TMP}" ]; then - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_filt" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} - rm -rf "${TMP_PATH}/complexsearch_tmp" - rm -f "${TMP_PATH}/complexcluster.sh" -fi \ No newline at end of file diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index e7675a0a..03dc95cc 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -45,6 +45,7 @@ int complexcluster(int argc, const char **argv, const Command &command) { // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); +\ setComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); setComplexClusterMustPassAlong(&par); @@ -58,6 +59,12 @@ int complexcluster(int argc, const char **argv, const Command &command) { par.filenames.pop_back(); CommandCaller cmd; + std::cout< - -#include "FileUtil.h" -#include "CommandCaller.h" -#include "Util.h" -#include "Debug.h" -#include "LocalParameters.h" - -#include "complexcluster.sh.h" - -void setComplexClusterDefaults(Parameters *p) { - //TODO, parameters for search, filtercomplex, cluster, createresults - p->covThr = 0.8; - p->covMode = 1; - p->clusteringMode = Parameters::GREEDY; - p->addBacktrace = true; - - // p->sensitivity = 4; - // p->evalThr = 0.001; - // p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; - // p->gapOpen = 10; - // p->gapExtend = 1; -} - -void setComplexClusterMustPassAlong(Parameters *p) { - p->PARAM_C.wasSet = true; - p->PARAM_ADD_BACKTRACE.wasSet = true; - // p->PARAM_E.wasSet = true; - // p->PARAM_S.wasSet = true; - // p->PARAM_ALIGNMENT_MODE.wasSet = true; - -} -int complexcluster(int argc, const char **argv, const Command &command) { - LocalParameters &par = LocalParameters::getLocalInstance(); - // TODO : figure out if commented params needed - // par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - -\ - setComplexClusterDefaults(&par); - par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - setComplexClusterMustPassAlong(&par); - - std::string tmpDir = par.filenames.back(); - std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); - if (par.reuseLatest) { - hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); - } - tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); - par.filenames.pop_back(); - - CommandCaller cmd; - std::cout< Date: Thu, 29 Feb 2024 00:10:24 +0900 Subject: [PATCH 039/160] Solved complexsearch parameter not applied problem --- data/complexcluster.sh | 4 ++-- data/easycomplexcluster.sh | 8 ++------ src/workflow/ComplexCluster.cpp | 1 - src/workflow/EasyComplexCluster.cpp | 10 ++++++---- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index e016101d..807e37bd 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -48,7 +48,7 @@ buildCmplDb() { cp "${1}.dbtype" "${2}.dbtype" } -[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; +# [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 @@ -58,7 +58,7 @@ fi if notExists "complex_db.dbtype"; then # shellcheck disable=SC2086 - $MMSEQS filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ + "$MMSEQS" filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ || fail "FilterComplex died" # build complex db as output diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index f19c9d9f..58d0eae7 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -12,11 +12,6 @@ exists() { [ -f "$1" ] } -# check number of input variables -# [ "$#" -ne 3 ] && echo "Please provide " && exit 1; -# REVIEW: TMP_DIR was already made by easycomplexcluster.cpp # check if files exist -# [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3"; - if notExists "${TMP_PATH}/input.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ @@ -24,7 +19,8 @@ if notExists "${TMP_PATH}/input.dbtype"; then fi if notExists "${TMP_PATH}/complex_clust.dbtype"; then - $MMSEQS complexcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clust" "${TMP_PATH}" "${COMPLEXCLUSTER_PAR}" \ + # shellcheck disable=SC2086 + "$MMSEQS" complexcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clust" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ || fail "Complexcluster died" fi diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 03dc95cc..83909793 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -45,7 +45,6 @@ int complexcluster(int argc, const char **argv, const Command &command) { // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); -\ setComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); setComplexClusterMustPassAlong(&par); diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 7113ac86..3ba989f8 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -9,7 +9,7 @@ #include "easycomplexcluster.sh.h" void setEasyComplexClusterDefaults(Parameters *p) { - p->removeTmpFiles = true; + // p->removeTmpFiles = true; } void setEasyComplexClusterMustPassAlong(Parameters *p) { @@ -25,15 +25,16 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - setEasyComplexClusterDefaults(&par); + // setEasyComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - setEasyComplexClusterMustPassAlong(&par); + // setEasyComplexClusterMustPassAlong(&par); std::string tmpDir = par.filenames.back(); std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); if (par.reuseLatest) { hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); par.filenames.pop_back(); @@ -46,7 +47,7 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { cmd.addVariable("RUNNER", par.runner.c_str()); cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); - cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow, true).c_str()); + cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow,true).c_str()); cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); @@ -54,6 +55,7 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { std::string program = tmpDir + "/easycomplexcluster.sh"; FileUtil::writeFile(program, easycomplexcluster_sh, easycomplexcluster_sh_len); + cmd.execProgram(program.c_str(), par.filenames); // Should never get here From e3defcd400519c0b02ed166e4a15a1be97600f68 Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 29 Feb 2024 18:29:14 +0900 Subject: [PATCH 040/160] separated buildCmplDb from filtercomplex --- data/complexcluster.sh | 5 +++-- data/complexsearch.sh | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 807e37bd..2f19bf78 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -56,11 +56,12 @@ if notExists "${TMP_PATH}/complex_result.dbtype"; then || fail "ComplexSearch died" fi -if notExists "complex_db.dbtype"; then +if notExists "complex_filt.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ || fail "FilterComplex died" - +fi +if notExists "${TMP_PATH}/complex_db.dbtype"; then # build complex db as output buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" fi diff --git a/data/complexsearch.sh b/data/complexsearch.sh index 0ce67fea..d721d5ac 100644 --- a/data/complexsearch.sh +++ b/data/complexsearch.sh @@ -28,6 +28,7 @@ if [ "$PREFMODE" != "EXHAUSTIVE" ]; then fi RESULT="${TMP_PATH}/result_expand_aligned" fi + if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 $MMSEQS scorecomplex "${QUERYDB}" "${TARGETDB}" "${RESULT}" "${OUTPUT}" ${SCORECOMPLEX_PAR} \ From 7d34bcbb99e1dee6379b1f987559048e79d6e315 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 29 Feb 2024 21:45:05 +0900 Subject: [PATCH 041/160] default parameters --- src/commons/LocalParameters.cpp | 2 +- src/strucclustutils/filtercomplex.cpp | 5 ---- src/workflow/ComplexCluster.cpp | 36 +++++++++------------------ 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 9577aa40..2a3ac58d 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -196,7 +196,7 @@ LocalParameters::LocalParameters() : easyscomplexsearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE); // complexclusterworkflow - complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); + complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); complexclusterworkflow = combineList(complexclusterworkflow, clust); //easycomplexclusterworkflow diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index fcdfa780..c53cc3ee 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -63,11 +63,6 @@ std::vector selecHighestCoverage( std::map::USE_INDEX); diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 83909793..4d72f30c 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -9,41 +9,29 @@ #include "complexcluster.sh.h" void setComplexClusterDefaults(Parameters *p) { - //TODO, parameters for search, filtercomplex, cluster, createresults p->covThr = 0.8; p->covMode = 1; p->clusteringMode = Parameters::GREEDY; - p->addBacktrace = true; - - // p->sensitivity = 4; - // p->evalThr = 0.001; - // p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; - // p->gapOpen = 10; - // p->gapExtend = 1; } void setComplexClusterMustPassAlong(Parameters *p) { p->PARAM_C.wasSet = true; - p->PARAM_ADD_BACKTRACE.wasSet = true; - // p->PARAM_E.wasSet = true; - // p->PARAM_S.wasSet = true; - // p->PARAM_ALIGNMENT_MODE.wasSet = true; } int complexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); - // TODO : figure out if commented params needed - // par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); - // par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); //prefilter + par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + for (size_t i = 0; i < par.createdb.size(); i++){ + par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } + par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + setComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); From 06945aed7aebce891be96c840290ac00c58c7986 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 29 Feb 2024 22:19:59 +0900 Subject: [PATCH 042/160] Parameters --- src/workflow/ComplexCluster.cpp | 3 ++- src/workflow/EasyComplexCluster.cpp | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 4d72f30c..435ea411 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -12,11 +12,12 @@ void setComplexClusterDefaults(Parameters *p) { p->covThr = 0.8; p->covMode = 1; p->clusteringMode = Parameters::GREEDY; + p->removeTmpFiles = true; } void setComplexClusterMustPassAlong(Parameters *p) { p->PARAM_C.wasSet = true; - + p->PARAM_REMOVE_TMP_FILES.wasSet = true; } int complexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 3ba989f8..9965c4d5 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -9,15 +9,27 @@ #include "easycomplexcluster.sh.h" void setEasyComplexClusterDefaults(Parameters *p) { - // p->removeTmpFiles = true; + //TODO + p->removeTmpFiles = true; + p->writeLookup = true; } void setEasyComplexClusterMustPassAlong(Parameters *p) { + //TODO p->PARAM_REMOVE_TMP_FILES.wasSet = true; + p->PARAM_WRITE_LOOKUP.wasSet = true; } int easycomplexcluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); + //TODO + par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); //prefilter + par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); + for (size_t i = 0; i < par.createdb.size(); i++){ par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } @@ -25,9 +37,9 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - // setEasyComplexClusterDefaults(&par); + setEasyComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - // setEasyComplexClusterMustPassAlong(&par); + setEasyComplexClusterMustPassAlong(&par); std::string tmpDir = par.filenames.back(); std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); From 819a75a8b9b7e046ff04ae9caae9a3e49a5f0d89 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 29 Feb 2024 22:36:36 +0900 Subject: [PATCH 043/160] add description --- src/FoldseekBase.cpp | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 051b7b33..68287ee0 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -277,7 +277,14 @@ std::vector foldseekCommands = { }, {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, "Complex level cluster", - "foldseek complexcluster queryDB result tmp\n", + "#Clustering of PDB DB\n" + "foldseek complexcluster queryDB result tmp\n" + "# --cov-mode \n" + "# Sequence 0 1 2\n" + "# Q: MAVGTACRPA 60% IGN 60%\n" + "# T: -AVGTAC--- 60% 100% IGN\n" + "# Cutoff -c 0.7 - + -\n" + "# -c 0.6 + + +\n\n", "Seongeun Kim & Sooyoung Cha ", " ", CITATION_FOLDSEEK, { @@ -288,7 +295,19 @@ std::vector foldseekCommands = { }, {"easy-complexcluster", easycomplexcluster, &localPar.easycomplexclusterworkflow, COMMAND_EASY, "Complex level cluster", - "foldseek easy-complexcluster examples/ result tmp\n", + "#Clustering of PDB files\n" + "foldseek easy-complexcluster examples/ result tmp\n" + "# Cluster output\n" + "# - result_rep_seq.fasta: Representatives\n" + "# - result_all_seq.fasta: FASTA-like per cluster\n" + "# - result_cluster.tsv: Adjacency list\n\n" + "# Important parameter: --cov-mode and -c \n" + "# --cov-mode \n" + "# 0 1 2\n" + "# Q: MAVGTACRPA 60% IGN 60%\n" + "# T: -AVGTAC--- 60% 100% IGN\n" + "# -c 0.7 - + -\n" + "# -c 0.6 + + +\n\n", "Seongeun Kim & Sooyoung Cha ", " ... ", CITATION_FOLDSEEK, { From fe9865cba133a778645c8dba0c226dce5afd94e6 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 29 Feb 2024 22:40:40 +0900 Subject: [PATCH 044/160] small changes --- src/FoldseekBase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 68287ee0..a931f1e9 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -278,7 +278,7 @@ std::vector foldseekCommands = { {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, "Complex level cluster", "#Clustering of PDB DB\n" - "foldseek complexcluster queryDB result tmp\n" + "foldseek complexcluster queryDB clusterDB tmp\n" "# --cov-mode \n" "# Sequence 0 1 2\n" "# Q: MAVGTACRPA 60% IGN 60%\n" From fa875fdf1169d9712f3f16f12e341a698fc9cfbc Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 4 Mar 2024 02:52:03 +0900 Subject: [PATCH 045/160] making complex header file --- data/complexcluster.sh | 65 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 2f19bf78..9947583b 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -47,6 +47,58 @@ buildCmplDb() { ln -s "$(abspath "${1}")" "${2}.0" cp "${1}.dbtype" "${2}.dbtype" } +# Shift initial header DB into complex header DB +buildheadCmplDb() { + awk -F"\t" ' + FNR==NR{ + split($2, parts, ".pdb") + HEADS = parts[1] + if (!($3 in cmplid)){ + cmplid[$3] = HEADS + + }; + next + } + { + split($1, part, ".pdb") + COMP = substr(part[1], 2) + if (!(COMP in head_arr)) { + HEAD = substr(part[2], 4) + head_arr[COMP] = HEAD + } + } + END { + for (cmpl in cmplid) { + print cmplid[cmpl]".pdb", head_arr[cmplid[cmpl]] + } + }' "${1}.lookup" "${1}_h" > "${2}" + cp "${1}.dbtype" "${2}.dbtype" +} + +buildheadIndexCmplDb() { + awk -F"\t" 'BEGIN {OFFSET=0} + FNR==NR{ + if (!($3 in cmplchain)){ + cmplchain[$3]=$1 + }; + for (cmpl in cmplchain) { + chaincmpl[cmplchain[cmpl]]=cmpl + }; + next + } + { + if (($1 in chaincmpl)) { + off_arr[chaincmpl[$1]]=OFFSET + OFFSET+=$3 + cmpl_len[chaincmpl[$1]]=$3 + } + } + END { + for (cmpl in off_arr) { + print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] + } + }' "${1}.lookup" "${1}_h.index" > "${2}.index" +} # [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; @@ -61,10 +113,23 @@ if notExists "complex_filt.dbtype"; then "$MMSEQS" filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ || fail "FilterComplex died" fi + +# shift query DB, .index, .dbtype if notExists "${TMP_PATH}/complex_db.dbtype"; then # build complex db as output buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" fi +# Shift _h, _h.dbtype +if notExists "${TMP_PATH}/complex_db_h.dbtype"; then + # build complex header db as output + buildheadCmplDb "${INPUT}" "${TMP_PATH}/complex_db_h" +fi + +# Shift _h.index +if notExists "${TMP_PATH}/complex_db_h.index"; then + # build complex header.index as output + buildheadIndexCmplDb "${INPUT}" "${TMP_PATH}/complex_db_h" +fi COMP="${TMP_PATH}/complex_db" From 412b51d023b1ce4e7a4de9ca2a3bf707d51b82a4 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 4 Mar 2024 14:45:06 +0900 Subject: [PATCH 046/160] header file --- data/complexcluster.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 9947583b..d96814c7 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -52,7 +52,7 @@ buildheadCmplDb() { awk -F"\t" ' FNR==NR{ split($2, parts, ".pdb") - HEADS = parts[1] + HEADS = "\0"parts[1] if (!($3 in cmplid)){ cmplid[$3] = HEADS @@ -61,7 +61,7 @@ buildheadCmplDb() { } { split($1, part, ".pdb") - COMP = substr(part[1], 2) + COMP = part[1] if (!(COMP in head_arr)) { HEAD = substr(part[2], 4) head_arr[COMP] = HEAD From 86f5fe9e045665aa1262ac7df1a1679cb56680c2 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 4 Mar 2024 16:20:51 +0900 Subject: [PATCH 047/160] colsed easycc --- data/complexcluster.sh | 66 +++------------------------------ data/easycomplexcluster.sh | 23 +++++++++--- src/workflow/ComplexCluster.cpp | 3 +- 3 files changed, 25 insertions(+), 67 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index d96814c7..badd6043 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -47,58 +47,6 @@ buildCmplDb() { ln -s "$(abspath "${1}")" "${2}.0" cp "${1}.dbtype" "${2}.dbtype" } -# Shift initial header DB into complex header DB -buildheadCmplDb() { - awk -F"\t" ' - FNR==NR{ - split($2, parts, ".pdb") - HEADS = "\0"parts[1] - if (!($3 in cmplid)){ - cmplid[$3] = HEADS - - }; - next - } - { - split($1, part, ".pdb") - COMP = part[1] - if (!(COMP in head_arr)) { - HEAD = substr(part[2], 4) - head_arr[COMP] = HEAD - } - } - END { - for (cmpl in cmplid) { - print cmplid[cmpl]".pdb", head_arr[cmplid[cmpl]] - } - }' "${1}.lookup" "${1}_h" > "${2}" - cp "${1}.dbtype" "${2}.dbtype" -} - -buildheadIndexCmplDb() { - awk -F"\t" 'BEGIN {OFFSET=0} - FNR==NR{ - if (!($3 in cmplchain)){ - cmplchain[$3]=$1 - }; - for (cmpl in cmplchain) { - chaincmpl[cmplchain[cmpl]]=cmpl - }; - next - } - { - if (($1 in chaincmpl)) { - off_arr[chaincmpl[$1]]=OFFSET - OFFSET+=$3 - cmpl_len[chaincmpl[$1]]=$3 - } - } - END { - for (cmpl in off_arr) { - print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl] - } - }' "${1}.lookup" "${1}_h.index" > "${2}.index" -} # [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; @@ -119,16 +67,12 @@ if notExists "${TMP_PATH}/complex_db.dbtype"; then # build complex db as output buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" fi -# Shift _h, _h.dbtype -if notExists "${TMP_PATH}/complex_db_h.dbtype"; then - # build complex header db as output - buildheadCmplDb "${INPUT}" "${TMP_PATH}/complex_db_h" -fi -# Shift _h.index -if notExists "${TMP_PATH}/complex_db_h.index"; then - # build complex header.index as output - buildheadIndexCmplDb "${INPUT}" "${TMP_PATH}/complex_db_h" +# Shift _h, _h.dbtype +if notExists "${TMP_PATH}/complex_db_h.dbtype"; then + # shellcheck disable=SC2086 + "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_h" ${VERBOSITY_PAR} \ + || fail "tsv2db died" fi COMP="${TMP_PATH}/complex_db" diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 58d0eae7..d9d3afed 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -12,21 +12,34 @@ exists() { [ -f "$1" ] } +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + + if notExists "${TMP_PATH}/input.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ || fail "input createdb died" fi -if notExists "${TMP_PATH}/complex_clust.dbtype"; then +if notExists "${TMP_PATH}/complex_clu.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" complexcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clust" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ + "$MMSEQS" complexcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clu" "$(dirname "${RESULT}")" ${COMPLEXCLUSTER_PAR} \ || fail "Complexcluster died" fi -# TODO: copmlex_db need header/lookup? -# SOURCE=$INPUT -INPUT="${TMP_PATH}/complex_db" +INPUT="$(dirname "${RESULT}")/latest/complex_db" if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 435ea411..f06c15cf 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -59,7 +59,8 @@ int complexcluster(int argc, const char **argv, const Command &command) { cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); - + cmd.addVariable("VERBCOMPRESS", par.createParameterString(par.verbandcompression).c_str()); + std::string program = tmpDir + "/complexcluster.sh"; FileUtil::writeFile(program, complexcluster_sh, complexcluster_sh_len); cmd.execProgram(program.c_str(), par.filenames); From 7e5bd089b5a306cc2fc07f95bdceca4ebd17b19b Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 4 Mar 2024 16:25:17 +0900 Subject: [PATCH 048/160] changed tmp dir --- data/easycomplexcluster.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index d9d3afed..438f8645 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -35,11 +35,11 @@ fi if notExists "${TMP_PATH}/complex_clu.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" complexcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clu" "$(dirname "${RESULT}")" ${COMPLEXCLUSTER_PAR} \ + "$MMSEQS" complexcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clu" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ || fail "Complexcluster died" fi -INPUT="$(dirname "${RESULT}")/latest/complex_db" +INPUT="${TMP_PATH}/latest/complex_db" if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ @@ -84,5 +84,7 @@ if [ -n "${REMOVE_TMP}" ]; then "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_rep" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${INPUT}" ${VERBOSITY_PAR} rm -f "${TMP_PATH}/easycomplexcluster.sh" fi \ No newline at end of file From d16ac1001b3c856e934642b8cc9abe96b8a953fb Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 4 Mar 2024 16:28:29 +0900 Subject: [PATCH 049/160] tmp remove --- data/easycomplexcluster.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 438f8645..02be2ec2 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -86,5 +86,8 @@ if [ -n "${REMOVE_TMP}" ]; then "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${INPUT}" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${INPUT}_h" ${VERBOSITY_PAR} + rm -rf "${TMP_PATH}/latest" rm -f "${TMP_PATH}/easycomplexcluster.sh" fi \ No newline at end of file From 560c6e49566d1cd757f80d11a2fa5c92116da5eb Mon Sep 17 00:00:00 2001 From: rachelse Date: Mon, 4 Mar 2024 17:28:39 +0900 Subject: [PATCH 050/160] temporary Result2repseq --- data/easycomplexcluster.sh | 43 +++++++++++++++++++++++++++----------- data/test.sh | 32 ++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 12 deletions(-) create mode 100755 data/test.sh diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 02be2ec2..6e288f2f 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -42,30 +42,49 @@ fi INPUT="${TMP_PATH}/latest/complex_db" if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clust" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ || fail "Convert Alignments died" fi -#TODO: figure out how to represent complex sequences as a single fasta entry? +#TODO: move it to complexcluster.sh? + +mapCmpl2Chain() { + awk 'BEGIN {FS="\t"} + NR==FNR { + if (!($0 ~ /^[0-9]+$/)) { + split($1,name,"\0") + reps[name[2]]=$1;next + } else if (FNR==1) { + reps[$1]=$1 + } + next + } + { if ($3 in reps) { + print "\0"$1 + } + }' "${1}" "${2}".lookup > "${3}" +} + if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then + mapCmpl2Chain "${TMP_PATH}/complex_clust" "${INPUT}" "${TMP_PATH}/complex_clust_chains" # shellcheck disable=SC2086 - "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ + "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clust_chains" "${TMP_PATH}/complex_clust_rep" ${RESULT2REPSEQ_PAR} \ || fail "Result2repseq died" # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clust_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ || fail "result2flat died" fi -if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then - # shellcheck disable=SC2086 - "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/complex_clu_seqs" ${THREADS_PAR} \ - || fail "Result2repseq died" +# if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then +# # shellcheck disable=SC2086 +# "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clust" "${TMP_PATH}/complex_clust_seqs" ${THREADS_PAR} \ +# || fail "Result2repseq died" - # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu_seqs" "${TMP_PATH}/complex_all_seqs.fasta" ${VERBOSITY_PAR} \ - || fail "result2flat died" -fi +# # shellcheck disable=SC2086 +# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clust_seqs" "${TMP_PATH}/complex_all_seqs.fasta" ${VERBOSITY_PAR} \ +# || fail "result2flat died" +# fi mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" diff --git a/data/test.sh b/data/test.sh new file mode 100755 index 00000000..60111a46 --- /dev/null +++ b/data/test.sh @@ -0,0 +1,32 @@ +#!/bin/sh -e + + +mapCmpl2Chain() { + awk -F"\t" 'BEGIN {} + NR==FNR { + if ($0 ~ /^[0-9]+$/) { + reps[$1]=$1;next + } + } + { if ($3 in reps) { + print $1 + print "\0"$1 + } + } + ' "${1}" "${2}".lookup > "${3}" +} + +# if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then +# mapCmpl2Chain "${TMP_PATH}/complex_clust" "${INPUT}" "${TMP_PATH}/complex_clust_chains" +# # shellcheck disable=SC2086 +# "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clust_chains" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ +# || fail "Result2repseq died" + +# # shellcheck disable=SC2086 +# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clust_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ +# || fail "result2flat died" +# fi + +lookupfile="/Users/steineggerlab/Desktop/foldseek/toydata/emmanuel_db/emmanuel_7soy_db" +testfile="/Users/steineggerlab/Desktop/foldseek/toydata/emmanuel_7soy_easycc_tmp/9595115010967635338/complex_clust" +mapCmpl2Chain $testfile $lookupfile "/Users/steineggerlab/Desktop/foldseek/toydata/emmanuel_7soy_easycc_tmp/9595115010967635338/complex_clust_reps" From 4ceb9adaae1baad6b0eadbd420303f936f446fbe Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 5 Mar 2024 01:38:57 +0900 Subject: [PATCH 051/160] Completed to output rep seqs fasta file --- data/easycomplexcluster.sh | 53 +++++++++++++++-------------- data/test.sh | 32 ----------------- src/commons/LocalParameters.cpp | 1 + src/workflow/EasyComplexCluster.cpp | 1 + 4 files changed, 30 insertions(+), 57 deletions(-) delete mode 100755 data/test.sh diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 6e288f2f..80a16a27 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -39,43 +39,45 @@ if notExists "${TMP_PATH}/complex_clu.dbtype"; then || fail "Complexcluster died" fi +SOURCE="${TMP_PATH}/input" INPUT="${TMP_PATH}/latest/complex_db" if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clust" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ || fail "Convert Alignments died" fi -#TODO: move it to complexcluster.sh? - -mapCmpl2Chain() { - awk 'BEGIN {FS="\t"} - NR==FNR { - if (!($0 ~ /^[0-9]+$/)) { - split($1,name,"\0") - reps[name[2]]=$1;next - } else if (FNR==1) { - reps[$1]=$1 - } - next +#NOTE: move it to complexcluster.sh? +mapCmplName2ChainKeys() { + repComp=$(awk -F"\t" '{print $1}' "${1}" | sort -u | awk '{printf "%s ", $0}' ) + awk -F"\t" -v repComp="${repComp}" ' + BEGIN { + split(repComp,repCompArr," "); + for (i in repCompArr) {repCompArr[repCompArr[i]]=""} + } + NR==FNR && ($2 in repCompArr){ + repIdxArr[$1]="";next } - { if ($3 in reps) { - print "\0"$1 + NR!=FNR && ($3 in repIdxArr) { + print $1 } - }' "${1}" "${2}".lookup > "${3}" + ' "${2}.source" "${2}.lookup" > "${3}" } -if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then - mapCmpl2Chain "${TMP_PATH}/complex_clust" "${INPUT}" "${TMP_PATH}/complex_clust_chains" +if notExists "${TMP_PATH}/complex_rep_seqs.dbtype"; then + mapCmplName2ChainKeys "${TMP_PATH}/cluster.tsv" "${SOURCE}" "${TMP_PATH}/rep_seqs.list" # shellcheck disable=SC2086 - "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clust_chains" "${TMP_PATH}/complex_clust_rep" ${RESULT2REPSEQ_PAR} \ - || fail "Result2repseq died" + "$MMSEQS" createsubdb "${TMP_PATH}/rep_seqs.list" "${SOURCE}" "${TMP_PATH}/complex_rep_seqs" ${CREATESUBDB_PAR} \ + || fail "createsubdb died" +fi +if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clust_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${SOURCE}" "${SOURCE}" "${TMP_PATH}/complex_rep_seqs" "${TMP_PATH}/complex_rep_seq.fasta" ${VERBOSITY_PAR} \ || fail "result2flat died" fi +#TODO: generate fasta file for all sequences # if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then # # shellcheck disable=SC2086 # "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clust" "${TMP_PATH}/complex_clust_seqs" ${THREADS_PAR} \ @@ -86,7 +88,7 @@ fi # || fail "result2flat died" # fi -mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" +# mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" @@ -97,16 +99,17 @@ if [ -n "${REMOVE_TMP}" ]; then "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_db" ${VERBOSITY_PAR} + # # shellcheck disable=SC2086 + # "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_rep" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/complex_rep_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${INPUT}" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${INPUT}_h" ${VERBOSITY_PAR} + rm "${TMP_PATH}/rep_seqs.list" rm -rf "${TMP_PATH}/latest" rm -f "${TMP_PATH}/easycomplexcluster.sh" fi \ No newline at end of file diff --git a/data/test.sh b/data/test.sh deleted file mode 100755 index 60111a46..00000000 --- a/data/test.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/sh -e - - -mapCmpl2Chain() { - awk -F"\t" 'BEGIN {} - NR==FNR { - if ($0 ~ /^[0-9]+$/) { - reps[$1]=$1;next - } - } - { if ($3 in reps) { - print $1 - print "\0"$1 - } - } - ' "${1}" "${2}".lookup > "${3}" -} - -# if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then -# mapCmpl2Chain "${TMP_PATH}/complex_clust" "${INPUT}" "${TMP_PATH}/complex_clust_chains" -# # shellcheck disable=SC2086 -# "$MMSEQS" result2repseq "${INPUT}" "${TMP_PATH}/complex_clust_chains" "${TMP_PATH}/complex_clu_rep" ${RESULT2REPSEQ_PAR} \ -# || fail "Result2repseq died" - -# # shellcheck disable=SC2086 -# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clust_rep" "${TMP_PATH}/complex_rep_seq.fasta" --use-fasta-header ${VERBOSITY_PAR} \ -# || fail "result2flat died" -# fi - -lookupfile="/Users/steineggerlab/Desktop/foldseek/toydata/emmanuel_db/emmanuel_7soy_db" -testfile="/Users/steineggerlab/Desktop/foldseek/toydata/emmanuel_7soy_easycc_tmp/9595115010967635338/complex_clust" -mapCmpl2Chain $testfile $lookupfile "/Users/steineggerlab/Desktop/foldseek/toydata/emmanuel_7soy_easycc_tmp/9595115010967635338/complex_clust_reps" diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 2a3ac58d..cb37314d 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -193,6 +193,7 @@ LocalParameters::LocalParameters() : easyscomplexsearchworkflow = combineList(structurecreatedb, complexsearchworkflow); easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, convertalignments); easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, createcomplexreport); + easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, createsubdb); easyscomplexsearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE); // complexclusterworkflow diff --git a/src/workflow/EasyComplexCluster.cpp b/src/workflow/EasyComplexCluster.cpp index 9965c4d5..345f7b6e 100644 --- a/src/workflow/EasyComplexCluster.cpp +++ b/src/workflow/EasyComplexCluster.cpp @@ -61,6 +61,7 @@ int easycomplexcluster(int argc, const char **argv, const Command &command) { cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow,true).c_str()); cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); + cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str()); cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); From b25de75fc9d743a06d186dee9276ada4041e09cc Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 5 Mar 2024 13:52:26 +0900 Subject: [PATCH 052/160] remove tmp files --- data/easycomplexcluster.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 80a16a27..7eb82bb6 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -93,19 +93,22 @@ mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" if [ -n "${REMOVE_TMP}" ]; then - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} + rm "${INPUT}.0" # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_db" ${VERBOSITY_PAR} - # # shellcheck disable=SC2086 + # shellcheck disable=SC2086 # "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_rep_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_rep_seqs_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 "$MMSEQS" rmdb "${INPUT}" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${INPUT}_h" ${VERBOSITY_PAR} From b7a274541c89035ddfc987470c7c3d2e54f6d85e Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 5 Mar 2024 13:56:02 +0900 Subject: [PATCH 053/160] easy-cc description --- src/FoldseekBase.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index a931f1e9..e8d8d0a9 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -299,7 +299,6 @@ std::vector foldseekCommands = { "foldseek easy-complexcluster examples/ result tmp\n" "# Cluster output\n" "# - result_rep_seq.fasta: Representatives\n" - "# - result_all_seq.fasta: FASTA-like per cluster\n" "# - result_cluster.tsv: Adjacency list\n\n" "# Important parameter: --cov-mode and -c \n" "# --cov-mode \n" From 81ac5ef5f44568d4c6fc4183862dd27e4aad3d7c Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 5 Mar 2024 14:08:05 +0900 Subject: [PATCH 054/160] Generates comment about rep complex in fastafile --- data/easycomplexcluster.sh | 47 ++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 7eb82bb6..7d3cfa94 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -26,6 +26,35 @@ abspath() { fi } +mapCmplName2ChainKeys() { + repComp=$(awk -F"\t" '{print $1}' "${1}" | sort -u | awk '{printf "%s ", $0}' ) + awk -F"\t" -v repComp="${repComp}" ' + BEGIN { + split(repComp,repCompArr," "); + for (i in repCompArr) {repCompArr[repCompArr[i]]=""} + } + NR==FNR && ($2 in repCompArr){ + repIdxArr[$1]="";next + } + NR!=FNR && ($3 in repIdxArr) { + print $1 + } + ' "${2}.source" "${2}.lookup" > "${3}" +} + +postprocessFasta() { + awk ' BEGIN {FS=">"} + $0 ~/^>/ { + match($2, /(.*).pdb*/) + complex = substr($2, RSTART, RLENGTH-4) + if (!(complex in repComplex)) { + print "#"complex".pdb" + repComplex[complex] = "" + } + } + {print $0} + ' "${1}" > "${1}.tmp" && mv "${1}.tmp" "${1}" +} if notExists "${TMP_PATH}/input.dbtype"; then # shellcheck disable=SC2086 @@ -47,23 +76,6 @@ if notExists "${TMP_PATH}/cluster.tsv"; then || fail "Convert Alignments died" fi -#NOTE: move it to complexcluster.sh? -mapCmplName2ChainKeys() { - repComp=$(awk -F"\t" '{print $1}' "${1}" | sort -u | awk '{printf "%s ", $0}' ) - awk -F"\t" -v repComp="${repComp}" ' - BEGIN { - split(repComp,repCompArr," "); - for (i in repCompArr) {repCompArr[repCompArr[i]]=""} - } - NR==FNR && ($2 in repCompArr){ - repIdxArr[$1]="";next - } - NR!=FNR && ($3 in repIdxArr) { - print $1 - } - ' "${2}.source" "${2}.lookup" > "${3}" -} - if notExists "${TMP_PATH}/complex_rep_seqs.dbtype"; then mapCmplName2ChainKeys "${TMP_PATH}/cluster.tsv" "${SOURCE}" "${TMP_PATH}/rep_seqs.list" # shellcheck disable=SC2086 @@ -75,6 +87,7 @@ if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then # shellcheck disable=SC2086 "$MMSEQS" result2flat "${SOURCE}" "${SOURCE}" "${TMP_PATH}/complex_rep_seqs" "${TMP_PATH}/complex_rep_seq.fasta" ${VERBOSITY_PAR} \ || fail "result2flat died" + postprocessFasta "${TMP_PATH}/complex_rep_seq.fasta" fi #TODO: generate fasta file for all sequences From 4329b254a07d3f6cd9ac8ddf038d5f2115fb2818 Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 5 Mar 2024 14:20:08 +0900 Subject: [PATCH 055/160] Finalized rmdb --- data/easycomplexcluster.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 7d3cfa94..67c4f340 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -125,6 +125,10 @@ if [ -n "${REMOVE_TMP}" ]; then "$MMSEQS" rmdb "${INPUT}" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${INPUT}_h" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_ca" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/input_ss" ${VERBOSITY_PAR} rm "${TMP_PATH}/rep_seqs.list" rm -rf "${TMP_PATH}/latest" rm -f "${TMP_PATH}/easycomplexcluster.sh" From 369842e2ed1455b90e15ba653df97a49db47548d Mon Sep 17 00:00:00 2001 From: rachelse Date: Fri, 8 Mar 2024 15:05:49 +0900 Subject: [PATCH 056/160] Solved argument list too long issue --- data/easycomplexcluster.sh | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 67c4f340..eea901fa 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -27,19 +27,21 @@ abspath() { } mapCmplName2ChainKeys() { - repComp=$(awk -F"\t" '{print $1}' "${1}" | sort -u | awk '{printf "%s ", $0}' ) - awk -F"\t" -v repComp="${repComp}" ' - BEGIN { - split(repComp,repCompArr," "); - for (i in repCompArr) {repCompArr[repCompArr[i]]=""} + awk -F"\t" ' + # BEGIN { + # split(repComp,repCompArr," "); + # for (i in repCompArr) {repCompArr[repCompArr[i]]=""} + # } + NR==FNR { + repName_memName[$1]=$2;next } - NR==FNR && ($2 in repCompArr){ - repIdxArr[$1]="";next + { + split($2,cmpNameArr,".pdb"); cmpl=cmpNameArr[1]".pdb" + if (cmpl in repName_memName) { + print $1 + } } - NR!=FNR && ($3 in repIdxArr) { - print $1 - } - ' "${2}.source" "${2}.lookup" > "${3}" + ' "${1}" "${2}.lookup" > "${3}" } postprocessFasta() { From ff5a8e5ff308658214b3cfd7cc71e007a2f3592a Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 13 Mar 2024 15:21:57 +0900 Subject: [PATCH 057/160] filtercomplex tmp coverage.tsv --- data/complexcluster.sh | 2 +- src/FoldseekBase.cpp | 4 +++- src/strucclustutils/filtercomplex.cpp | 27 +++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index badd6043..ca3deeb7 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -58,7 +58,7 @@ fi if notExists "complex_filt.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERCOMPLEX_PAR} \ + "$MMSEQS" filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" "${TMP_PATH}/filtcov.tsv" ${FILTERCOMPLEX_PAR} \ || fail "FilterComplex died" fi diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 48885056..07ffa19b 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -280,7 +280,9 @@ std::vector foldseekCommands = { {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, - {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb } + {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, + + {"tmptsv", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::flatfile } } }, {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index c53cc3ee..7c91a6c0 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -69,6 +69,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { std::map qKeyToSet; std::map tKeyToSet; char buffer[32]; + char buffer5[32]; IndexReader* qDbr; qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); @@ -92,6 +93,10 @@ int filtercomplex(int argc, const char **argv, const Command &command) { DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, db4Type); resultWriter.open(); + const int db5Type = Parameters::DBTYPE_GENERIC_DB; + DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), 1, shouldCompress, db5Type); + resultWrite5.open(); + std::string qLookupFile = par.db1 + ".lookup"; std::string tLookupFile = par.db2 + ".lookup"; TranslateNucl translateNucl(static_cast(par.translationTable)); @@ -173,6 +178,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } } std::string result; + std::string result5; std::vector keysToDelete; for (const auto& pair : qcovSum){ float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); @@ -210,18 +216,39 @@ int filtercomplex(int argc, const char **argv, const Command &command) { case Parameters::COV_MODE_QUERY: selectedAssIDs = selecHighestCoverage(qcompIdToassIdToalnSum); break; + case Parameters::COV_MODE_LENGTH_QUERY : + break; + case Parameters::COV_MODE_LENGTH_TARGET : + break; + case Parameters::COV_MODE_LENGTH_SHORTER : + break; + + // TODO : other coverage modes } for (const auto& pair : qcovSum){ if (std::find(selectedAssIDs.begin(), selectedAssIDs.end(), pair.first) != selectedAssIDs.end()){ char *outpos = Itoa::u32toa_sse2(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]], buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); + if (par.covMode == Parameters::COV_MODE_BIDIRECTIONAL) { + result5.append(std::to_string(qComplexId) + "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) + "\n"); + } + else if (par.covMode == Parameters::COV_MODE_TARGET){ + result5.append(std::to_string(qComplexId) + "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) + "\n"); + + } + else if (par.covMode == Parameters::COV_MODE_QUERY) { + result5.append(std::to_string(qComplexId) + "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) + "\n"); + } + } } resultWriter.writeData(result.c_str(), result.length(), qComplexId); + resultWrite5.writeData(result5.c_str(), result5.length(), 0); } } resultWriter.close(true); + resultWrite5.close(true); alnDbr.close(); delete qDbr; if (sameDB == false) { From 7367a247eb41a7a05b164dd42b0e9f7ddf39c338 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 13 Mar 2024 17:45:38 +0900 Subject: [PATCH 058/160] assID, query, target, coverage(1 or 2), tm(1 or 2) --- src/strucclustutils/filtercomplex.cpp | 78 +++++++++++++++++++++------ 1 file changed, 61 insertions(+), 17 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 7c91a6c0..6911e429 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -60,6 +60,49 @@ std::vector selecHighestCoverage( std::map &complexIdtoName, + std::map &chainKeyToComplexIdLookup, + std::map> &complexIdToChainKeysLookup, + std::vector &complexIdVec +) { + if (file.length() == 0) { + return; + } + MemoryMapped lookupDB(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); + char *data = (char *) lookupDB.getData(); + char *end = data + lookupDB.mappedSize(); + const char *entry[255]; + int prevComplexId = -1; + while (data < end && *data != '\0') { + const size_t columns = Util::getWordsOfLine(data, entry, 255); + if (columns < 3) { + Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n"; + continue; + } + auto chainKey = Util::fast_atoi(entry[0]); + std::string chainName(entry[1], (entry[2] - entry[1]) - 1); + auto complexId = Util::fast_atoi(entry[2]); + chainKeyToComplexIdLookup.emplace(chainKey, complexId); + + size_t lastUnderscoreIndex = chainName.find_last_of('_'); + std::string complexName = chainName.substr(0, lastUnderscoreIndex); + + if (complexId != prevComplexId) { + complexIdToChainKeysLookup.emplace(complexId, std::vector()); + complexIdVec.emplace_back(complexId); + complexIdtoName.emplace(complexId, complexName); + prevComplexId = complexId; + } + complexIdToChainKeysLookup.at(complexId).emplace_back(chainKey); + data = Util::skipLine(data); + } + lookupDB.close(); +} + + int filtercomplex(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.parseParameters(argc, argv, command, true, 0, 0); @@ -69,7 +112,6 @@ int filtercomplex(int argc, const char **argv, const Command &command) { std::map qKeyToSet; std::map tKeyToSet; char buffer[32]; - char buffer5[32]; IndexReader* qDbr; qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); @@ -103,9 +145,10 @@ int filtercomplex(int argc, const char **argv, const Command &command) { Matcher::result_t res; std::map qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; std::map> qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; + std::map qcomplexIdToName, tcomplexIdToName; std::vector qComplexIdVec, tComplexIdVec; - getKeyToIdMapIdToKeysMapIdVec(qLookupFile, qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); - getKeyToIdMapIdToKeysMapIdVec(tLookupFile, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); + getlookupInfo(qLookupFile, qcomplexIdToName,qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); + getlookupInfo(tLookupFile, tcomplexIdToName, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); qChainKeyToComplexIdMap.clear(); Debug::Progress progress(qComplexIdVec.size()); std::vector complexResults; @@ -141,8 +184,8 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { - std::map qcovSum; - std::map tcovSum; + std::map qcovSum, tcovSum; + std::map qtmScores, ttmScores; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; std::map assIdTodbKey; std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; @@ -164,6 +207,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (qcovSum.find(assId) == qcovSum.end()) { qcovSum[assId] = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); assIdTodbKey.emplace(assId, res.dbKey); + qtmScores.emplace(assId, retComplex.qTmScore); } else{ qcovSum[assId] += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); @@ -171,6 +215,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (tcovSum.find(assId) == tcovSum.end()) { tcovSum[assId] = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); assIdTodbKey.emplace(assId, res.dbKey); + ttmScores.emplace(assId, retComplex.tTmScore); } else{ tcovSum[assId] += (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); @@ -222,27 +267,26 @@ int filtercomplex(int argc, const char **argv, const Command &command) { break; case Parameters::COV_MODE_LENGTH_SHORTER : break; - - // TODO : other coverage modes } for (const auto& pair : qcovSum){ if (std::find(selectedAssIDs.begin(), selectedAssIDs.end(), pair.first) != selectedAssIDs.end()){ char *outpos = Itoa::u32toa_sse2(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]], buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - if (par.covMode == Parameters::COV_MODE_BIDIRECTIONAL) { - result5.append(std::to_string(qComplexId) + "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) + "\n"); - } - else if (par.covMode == Parameters::COV_MODE_TARGET){ - result5.append(std::to_string(qComplexId) + "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) + "\n"); - } - else if (par.covMode == Parameters::COV_MODE_QUERY) { - result5.append(std::to_string(qComplexId) + "\t" + std::to_string(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) + "\n"); - } - + } + if (par.covMode == Parameters::COV_MODE_BIDIRECTIONAL) { + result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) +"\t"+ std::to_string(qtmScores[pair.first])+"\t"+ std::to_string(ttmScores[pair.first])+ "\n"); + } + else if (par.covMode == Parameters::COV_MODE_TARGET){ + result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) +"\t"+ std::to_string(ttmScores[pair.first])+ "\n"); + + } + else if (par.covMode == Parameters::COV_MODE_QUERY) { + result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) +"\t"+ std::to_string(qtmScores[pair.first])+"\n"); } } + resultWriter.writeData(result.c_str(), result.length(), qComplexId); resultWrite5.writeData(result5.c_str(), result5.length(), 0); } From c129453097ba963409cbc6a41e0d393bdfe7ee4a Mon Sep 17 00:00:00 2001 From: SooyoungCha <97579193+ChaSooyoung@users.noreply.github.com> Date: Wed, 13 Mar 2024 19:05:36 +0900 Subject: [PATCH 059/160] both tm for all cov modes --- src/strucclustutils/filtercomplex.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 6911e429..d1bc2e68 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -279,11 +279,11 @@ int filtercomplex(int argc, const char **argv, const Command &command) { result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) +"\t"+ std::to_string(qtmScores[pair.first])+"\t"+ std::to_string(ttmScores[pair.first])+ "\n"); } else if (par.covMode == Parameters::COV_MODE_TARGET){ - result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) +"\t"+ std::to_string(ttmScores[pair.first])+ "\n"); + result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) +"\t"+ std::to_string(qtmScores[pair.first])+"\t"+ std::to_string(ttmScores[pair.first])+ "\n"); } else if (par.covMode == Parameters::COV_MODE_QUERY) { - result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) +"\t"+ std::to_string(qtmScores[pair.first])+"\n"); + result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) +"\t"+ std::to_string(qtmScores[pair.first])+"\t"+ std::to_string(ttmScores[pair.first])+"\n"); } } @@ -299,4 +299,4 @@ int filtercomplex(int argc, const char **argv, const Command &command) { delete tDbr; } return EXIT_SUCCESS; -} \ No newline at end of file +} From 52f0459aec8569643307739a233be903c4917596 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 13 Mar 2024 21:41:17 +0900 Subject: [PATCH 060/160] TODO maybe TMthreshold --- src/strucclustutils/filtercomplex.cpp | 45 ++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index d1bc2e68..98c78f9f 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -17,10 +17,52 @@ #include #endif +// static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ +// switch (covMode) { +// case Parameters::COV_MODE_BIDIRECTIONAL: +// if (qTM >= TMThr && tTM >= TMThr) { +// return true; +// } +// else{ +// return false; +// } +// break; +// case Parameters::COV_MODE_TARGET: +// if (tTM >= TMThr) { +// return true; +// } +// else{ +// return false; +// } +// break; +// case Parameters::COV_MODE_QUERY: +// if (qTM >= TMThr) { +// return true; +// } +// else{ +// return false; +// } +// break; +// case Parameters::COV_MODE_LENGTH_QUERY : +// return true; +// break; +// case Parameters::COV_MODE_LENGTH_TARGET : +// return true; +// break; +// case Parameters::COV_MODE_LENGTH_SHORTER : +// return true; +// break; +// } +// } + +// bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr, double qTM, double tTM, float TMThr) { bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr) { const bool covOK = Util::hasCoverage(covThr, covMode, qcov, dbcov); + // const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); if ( - covOK + covOK + // covOK && + // TMOK ) { return true; } else { @@ -228,6 +270,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { for (const auto& pair : qcovSum){ float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); float dbcov = static_cast(tcovSum[pair.first]) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); + // if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr, qtmScores[pair.first], ttmScores[pair.first], par.filterTMThr)){ if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ keysToDelete.push_back(pair.first); } From a27efde9581d6f130dc885bbcb77a3f4744b5f15 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 14 Mar 2024 14:32:02 +0900 Subject: [PATCH 061/160] tmthreshold parameter --- src/commons/LocalParameters.cpp | 6 +- src/commons/LocalParameters.h | 2 + src/strucclustutils/filtercomplex.cpp | 90 +++++++++++++-------------- 3 files changed, 52 insertions(+), 46 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 955f7b12..368439fa 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -30,7 +30,9 @@ LocalParameters::LocalParameters() : PARAM_COMPLEX_REPORT_MODE(PARAM_COMPLEX_REPORT_MODE_ID, "--complex-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &complexReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN), PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), - PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC) + PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), + PARAM_FILT_TM_THRESHOLD(PARAM_FILT_TM_THRESHOLD_ID,"--filter-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") + { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$"; @@ -183,6 +185,7 @@ LocalParameters::LocalParameters() : filtercomplex.push_back(&PARAM_THREADS); filtercomplex.push_back(&PARAM_C); filtercomplex.push_back(&PARAM_COV_MODE); + filtercomplex.push_back(&PARAM_FILT_TM_THRESHOLD); // createcomplexreport createcomplexreport.push_back(&PARAM_DB_OUTPUT); @@ -244,6 +247,7 @@ LocalParameters::LocalParameters() : complexReportMode = 1; eValueThrExpandComplex = 10000.0; citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); + filtTmThr = 0.5; //rewrite param vals. PARAM_FORMAT_OUTPUT.description = "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,\nlddt,lddtfull,qca,tca,t,u,qtmscore,ttmscore,alntmscore,rmsd,prob\ncomplexqtmscore,complexttmscore,complexu,complext,complexassignid\n"; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 62aeed48..1eec2b03 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -125,6 +125,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_EXPAND_COMPLEX_EVALUE) PARAMETER(PARAM_INPUT_FORMAT) PARAMETER(PARAM_PDB_OUTPUT_MODE) + PARAMETER(PARAM_FILT_TM_THRESHOLD) int prefMode; float tmScoreThr; @@ -148,6 +149,7 @@ class LocalParameters : public Parameters { double eValueThrExpandComplex; int inputFormat; int pdbOutputMode; + float filtTmThr; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner, diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 98c78f9f..78213e36 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -17,52 +17,52 @@ #include #endif -// static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ -// switch (covMode) { -// case Parameters::COV_MODE_BIDIRECTIONAL: -// if (qTM >= TMThr && tTM >= TMThr) { -// return true; -// } -// else{ -// return false; -// } -// break; -// case Parameters::COV_MODE_TARGET: -// if (tTM >= TMThr) { -// return true; -// } -// else{ -// return false; -// } -// break; -// case Parameters::COV_MODE_QUERY: -// if (qTM >= TMThr) { -// return true; -// } -// else{ -// return false; -// } -// break; -// case Parameters::COV_MODE_LENGTH_QUERY : -// return true; -// break; -// case Parameters::COV_MODE_LENGTH_TARGET : -// return true; -// break; -// case Parameters::COV_MODE_LENGTH_SHORTER : -// return true; -// break; -// } -// } +static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + if (qTM >= TMThr && tTM >= TMThr) { + return true; + } + else{ + return false; + } + break; + case Parameters::COV_MODE_TARGET: + if (tTM >= TMThr) { + return true; + } + else{ + return false; + } + break; + case Parameters::COV_MODE_QUERY: + if (qTM >= TMThr) { + return true; + } + else{ + return false; + } + break; + case Parameters::COV_MODE_LENGTH_QUERY : + return true; + break; + case Parameters::COV_MODE_LENGTH_TARGET : + return true; + break; + case Parameters::COV_MODE_LENGTH_SHORTER : + return true; + break; + } +} -// bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr, double qTM, double tTM, float TMThr) { -bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr) { +bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr, double qTM, double tTM, float TMThr) { +// bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr) { const bool covOK = Util::hasCoverage(covThr, covMode, qcov, dbcov); - // const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); + const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); if ( - covOK - // covOK && - // TMOK + // covOK + covOK && + TMOK ) { return true; } else { @@ -270,8 +270,8 @@ int filtercomplex(int argc, const char **argv, const Command &command) { for (const auto& pair : qcovSum){ float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); float dbcov = static_cast(tcovSum[pair.first]) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); - // if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr, qtmScores[pair.first], ttmScores[pair.first], par.filterTMThr)){ - if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ + if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr, qtmScores[pair.first], ttmScores[pair.first], par.filtTmThr)){ + // if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ keysToDelete.push_back(pair.first); } } From 4e7c3624b3311b61fffecbbca1fcf7fbaf7daa16 Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 21 Mar 2024 22:14:39 +0900 Subject: [PATCH 062/160] Revised code of filter complex --- src/commons/LocalParameters.cpp | 9 +- src/commons/LocalParameters.h | 2 + src/strucclustutils/createcomplexreport.cpp | 1 + src/strucclustutils/filtercomplex.cpp | 235 +++++++++----------- src/workflow/ComplexCluster.cpp | 5 +- 5 files changed, 112 insertions(+), 140 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 368439fa..3713403c 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -31,8 +31,9 @@ LocalParameters::LocalParameters() : PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN), PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), - PARAM_FILT_TM_THRESHOLD(PARAM_FILT_TM_THRESHOLD_ID,"--filter-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") - + PARAM_FILT_TM_THRESHOLD(PARAM_FILT_TM_THRESHOLD_ID,"--filter-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_FILT_CHAIN_TM_THRESHOLD(PARAM_FILT_CHAIN_TM_THRESHOLD_ID,"--min-chain-tm", "per chain TMscore threshold for filtercomplex", "accept alignments satisfying tmscores of all chains > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") + { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$"; @@ -186,6 +187,7 @@ LocalParameters::LocalParameters() : filtercomplex.push_back(&PARAM_C); filtercomplex.push_back(&PARAM_COV_MODE); filtercomplex.push_back(&PARAM_FILT_TM_THRESHOLD); + filtercomplex.push_back(&PARAM_FILT_CHAIN_TM_THRESHOLD); // createcomplexreport createcomplexreport.push_back(&PARAM_DB_OUTPUT); @@ -247,7 +249,8 @@ LocalParameters::LocalParameters() : complexReportMode = 1; eValueThrExpandComplex = 10000.0; citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); - filtTmThr = 0.5; + filtTmThr = 0.0; + filtChainTmThr = 0.0; //rewrite param vals. PARAM_FORMAT_OUTPUT.description = "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,\nlddt,lddtfull,qca,tca,t,u,qtmscore,ttmscore,alntmscore,rmsd,prob\ncomplexqtmscore,complexttmscore,complexu,complext,complexassignid\n"; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 1eec2b03..2fe646ed 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -126,6 +126,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_INPUT_FORMAT) PARAMETER(PARAM_PDB_OUTPUT_MODE) PARAMETER(PARAM_FILT_TM_THRESHOLD) + PARAMETER(PARAM_FILT_CHAIN_TM_THRESHOLD) int prefMode; float tmScoreThr; @@ -150,6 +151,7 @@ class LocalParameters : public Parameters { int inputFormat; int pdbOutputMode; float filtTmThr; + float filtChainTmThr; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner, diff --git a/src/strucclustutils/createcomplexreport.cpp b/src/strucclustutils/createcomplexreport.cpp index 635dacd9..1b9c0464 100644 --- a/src/strucclustutils/createcomplexreport.cpp +++ b/src/strucclustutils/createcomplexreport.cpp @@ -162,6 +162,7 @@ int createcomplexreport(int argc, const char **argv, const Command &command) { compAlns[compAlnIdx].qChainNames.emplace_back(queryChainName); compAlns[compAlnIdx].tChainNames.emplace_back(targetChainName); } + } // while end } for (size_t compAlnIdx = 0; compAlnIdx < compAlns.size(); compAlnIdx++) { diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 78213e36..4c9f3838 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -17,72 +17,72 @@ #include #endif + + +unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (qcov+tcov)/2; + case Parameters::COV_MODE_TARGET: + return qcov; + case Parameters::COV_MODE_QUERY: + return tcov; + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return 0; + default: + return 0; + } +} + static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: - if (qTM >= TMThr && tTM >= TMThr) { - return true; - } - else{ - return false; - } - break; + return ((qTM>= TMThr) && (tTM >= TMThr)); case Parameters::COV_MODE_TARGET: - if (tTM >= TMThr) { - return true; - } - else{ - return false; - } - break; + return (tTM >= TMThr); case Parameters::COV_MODE_QUERY: - if (qTM >= TMThr) { - return true; - } - else{ - return false; - } - break; + return (qTM >= TMThr); case Parameters::COV_MODE_LENGTH_QUERY : - return true; - break; case Parameters::COV_MODE_LENGTH_TARGET : - return true; - break; case Parameters::COV_MODE_LENGTH_SHORTER : return true; - break; + default: + return true; } } -bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr, double qTM, double tTM, float TMThr) { -// bool checkFilterCriteria(float qcov, float dbcov, int covMode, float covThr) { - const bool covOK = Util::hasCoverage(covThr, covMode, qcov, dbcov); - const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); - if ( - // covOK - covOK && - TMOK - ) { - return true; - } else { - return false; +struct ComplexFilterCriteria { + ComplexFilterCriteria() {} + ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM, std::vector qChainTmScores, std::vector tChainTmScores) : + dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM), qChainTmScores(qChainTmScores), tChainTmScores(tChainTmScores) {} + + bool satisfyFilterCriteria(int covMode, float covThr, float TMThr) { + const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); + const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); + return (covOK && TMOK); } -} -unsigned int getQueryResidueLength( IndexReader& qDbr, std::vector &qChainKeys) { - unsigned int qResidueLen = 0; - for (auto qChainKey: qChainKeys) { - size_t id = qDbr.sequenceReader->getId(qChainKey); - // Not accessible - if (id == NOT_AVAILABLE_CHAIN_KEY) - return 0; - qResidueLen += qDbr.sequenceReader->getSeqLen(id); - } - return qResidueLen; -} + unsigned int dbKey; + unsigned int qTotalAlnLen; + unsigned int tTotalAlnLen; + float qCov; + float tCov; + double qTM; + double tTM; + std::vector qChainTmScores; //TODO + std::vector tChainTmScores; //TODO + // std::vector alignedQChainKeys; //TODO + // std::vector alignedTChainKeys; //TODO +}; -unsigned int getTargetResidueLength( IndexReader *qDbr, std::vector &qChainKeys) { +//TODO +// std::vector computeChainTmScore() { +// std::vector tmScores; +// return tmScores; +// } +unsigned int getComplexResidueLength( IndexReader *qDbr, std::vector &qChainKeys) { unsigned int qResidueLen = 0; for (auto qChainKey: qChainKeys) { size_t id = qDbr->sequenceReader->getId(qChainKey); @@ -94,15 +94,6 @@ unsigned int getTargetResidueLength( IndexReader *qDbr, std::vector selecHighestCoverage( std::map> &covMap){ - std::vector assIdvec; - for (auto pair : covMap){ - assIdvec.push_back(pair.second.rbegin()->second); - } - return assIdvec; -} - - static void getlookupInfo( const std::string &file, std::map &complexIdtoName, @@ -144,7 +135,6 @@ static void getlookupInfo( lookupDB.close(); } - int filtercomplex(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.parseParameters(argc, argv, command, true, 0, 0); @@ -193,7 +183,6 @@ int filtercomplex(int argc, const char **argv, const Command &command) { getlookupInfo(tLookupFile, tcomplexIdToName, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); qChainKeyToComplexIdMap.clear(); Debug::Progress progress(qComplexIdVec.size()); - std::vector complexResults; std::map tComplexLength; std::map qComplexLength; @@ -204,15 +193,15 @@ int filtercomplex(int argc, const char **argv, const Command &command) { thread_idx = static_cast(omp_get_thread_num()); #endif Matcher::result_t res; - std::vector localComplexResults; #pragma omp for schedule(dynamic, 10) nowait + for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { unsigned int tComplexId = tComplexIdVec[tComplexIdx]; std::vector &tChainKeys = tComplexIdToChainKeyMap[tComplexId]; if (tChainKeys.empty()) { continue; } - unsigned int reslen = getTargetResidueLength(tDbr, tChainKeys); + unsigned int reslen = getComplexResidueLength(tDbr, tChainKeys); tComplexLength[tComplexId] =reslen; } for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { @@ -221,16 +210,15 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (qChainKeys.empty()) { continue; } - unsigned int reslen = getTargetResidueLength(qDbr, qChainKeys); + unsigned int reslen = getComplexResidueLength(qDbr, qChainKeys); qComplexLength[qComplexId] = reslen; } for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { - std::map qcovSum, tcovSum; - std::map qtmScores, ttmScores; + std::map localComplexCriteria; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; - std::map assIdTodbKey; std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; + for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainDbKey = alnDbr.getId(qChainKey); @@ -244,92 +232,69 @@ int filtercomplex(int argc, const char **argv, const Command &command) { Debug(Debug::ERROR) << "No scorecomplex result provided"; EXIT(EXIT_FAILURE); } + data = Util::skipLine(data); unsigned int assId = retComplex.assId; - if (qcovSum.find(assId) == qcovSum.end()) { - qcovSum[assId] = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - assIdTodbKey.emplace(assId, res.dbKey); - qtmScores.emplace(assId, retComplex.qTmScore); - } - else{ - qcovSum[assId] += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - } - if (tcovSum.find(assId) == tcovSum.end()) { - tcovSum[assId] = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - assIdTodbKey.emplace(assId, res.dbKey); - ttmScores.emplace(assId, retComplex.tTmScore); - } - else{ - tcovSum[assId] += (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + + if (localComplexCriteria.find(assId) == localComplexCriteria.end()) { + localComplexCriteria[assId] = ComplexFilterCriteria(); + localComplexCriteria[assId].dbKey = res.dbKey; + localComplexCriteria[assId].qTotalAlnLen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + localComplexCriteria[assId].tTotalAlnLen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + localComplexCriteria[assId].qTM = retComplex.qTmScore; + localComplexCriteria[assId].tTM = retComplex.tTmScore; + // localComplexCriteria[assId].qChainTmScores = computeChainTmScore(); //TODO + // localComplexCriteria[assId].tChainTmScores = computeChainTmScore(); //TODO + } else { + localComplexCriteria[assId].qTotalAlnLen += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + localComplexCriteria[assId].tTotalAlnLen += (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); } } } std::string result; std::string result5; - std::vector keysToDelete; - for (const auto& pair : qcovSum){ - float qcov = static_cast(pair.second) / static_cast(qComplexLength[qComplexId]); - float dbcov = static_cast(tcovSum[pair.first]) / static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]]); - if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr, qtmScores[pair.first], ttmScores[pair.first], par.filtTmThr)){ - // if (!checkFilterCriteria(qcov, dbcov, par.covMode, par.covThr)){ - keysToDelete.push_back(pair.first); - } + std::vector assIdsToDelete; + + for (auto& assId_res : localComplexCriteria){ + unsigned int tComplexId = tChainKeyToComplexIdMap[assId_res.second.dbKey]; + assId_res.second.qCov = static_cast(assId_res.second.qTotalAlnLen) / static_cast(qComplexLength[qComplexId]); + assId_res.second.tCov = static_cast(assId_res.second.tTotalAlnLen) / static_cast(tComplexLength[tComplexId]); + if (!assId_res.second.satisfyFilterCriteria(par.covMode, par.covThr, par.filtTmThr)){ + assIdsToDelete.push_back(assId_res.first); + } } - for (const auto& key : keysToDelete) { - qcovSum.erase(key); - tcovSum.erase(key); + for (const auto& key : assIdsToDelete) { + localComplexCriteria.erase(key); } - - std::map> qcompIdToassIdToalnSum, tcompIdToassIdToalnSum, avgcompIdToassIdToalnSum; - for (const auto& pair : qcovSum){ - if (qcompIdToassIdToalnSum.find(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]) == qcompIdToassIdToalnSum.end()){ - qcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] = {{pair.second, pair.first}}; - tcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] ={{ tcovSum[pair.first], pair.first}}; - avgcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] = {{(pair.second+tcovSum[pair.first])/2, pair.first}}; + + std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] + for (const auto& assId_res : localComplexCriteria){ + unsigned int tComplexId = tChainKeyToComplexIdMap[assId_res.second.dbKey]; + unsigned int alnlen = adjustAlnLen(assId_res.second.qTotalAlnLen, assId_res.second.tTotalAlnLen, par.covMode); + if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ + cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } else{ - qcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]][pair.second] = pair.first; - tcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]][tcovSum[pair.first]] = pair.first; - avgcompIdToassIdToalnSum[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]][(pair.second + tcovSum[pair.first])/2] = pair.first ; + if (alnlen > cmplIdToBestAssId[tComplexId][1]){ + cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + } } } + std::vector selectedAssIDs; - switch (par.covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - selectedAssIDs = selecHighestCoverage(avgcompIdToassIdToalnSum); - break; - case Parameters::COV_MODE_TARGET: - selectedAssIDs = selecHighestCoverage(tcompIdToassIdToalnSum); - break; - case Parameters::COV_MODE_QUERY: - selectedAssIDs = selecHighestCoverage(qcompIdToassIdToalnSum); - break; - case Parameters::COV_MODE_LENGTH_QUERY : - break; - case Parameters::COV_MODE_LENGTH_TARGET : - break; - case Parameters::COV_MODE_LENGTH_SHORTER : - break; + for (const auto& pair : cmplIdToBestAssId){ + selectedAssIDs.push_back(pair.second[0]); } - for (const auto& pair : qcovSum){ - if (std::find(selectedAssIDs.begin(), selectedAssIDs.end(), pair.first) != selectedAssIDs.end()){ - char *outpos = Itoa::u32toa_sse2(tChainKeyToComplexIdMap[assIdTodbKey[pair.first]], buffer); + + for (const auto& assId_res : localComplexCriteria){ + unsigned int tComplexId = tChainKeyToComplexIdMap[assId_res.second.dbKey]; + if (std::find(selectedAssIDs.begin(), selectedAssIDs.end(), assId_res.first) != selectedAssIDs.end()){ + char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - - } - if (par.covMode == Parameters::COV_MODE_BIDIRECTIONAL) { - result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) +"\t"+ std::to_string(qtmScores[pair.first])+"\t"+ std::to_string(ttmScores[pair.first])+ "\n"); - } - else if (par.covMode == Parameters::COV_MODE_TARGET){ - result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(tcovSum[pair.first]/ static_cast(tComplexLength[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]])) +"\t"+ std::to_string(qtmScores[pair.first])+"\t"+ std::to_string(ttmScores[pair.first])+ "\n"); - - } - else if (par.covMode == Parameters::COV_MODE_QUERY) { - result5.append(std::to_string(pair.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tChainKeyToComplexIdMap[assIdTodbKey[pair.first]]] + "\t" + std::to_string(pair.second/static_cast(qComplexLength[qComplexId])) +"\t"+ std::to_string(qtmScores[pair.first])+"\t"+ std::to_string(ttmScores[pair.first])+"\n"); + result5.append(std::to_string(assId_res.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tComplexId] + "\t" + std::to_string(assId_res.second.qCov) + "\t" + std::to_string(assId_res.second.tCov) + "\t"+ std::to_string(assId_res.second.qTM)+"\t"+ std::to_string(assId_res.second.tTM)+ "\n"); } } - resultWriter.writeData(result.c_str(), result.length(), qComplexId); resultWrite5.writeData(result5.c_str(), result5.length(), 0); } diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index f06c15cf..191225c5 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -8,8 +8,10 @@ #include "complexcluster.sh.h" -void setComplexClusterDefaults(Parameters *p) { +void setComplexClusterDefaults(LocalParameters *p) { p->covThr = 0.8; + p->filtTmThr = 0.5; // FIX + p->filtChainTmThr=0.0; // FIX p->covMode = 1; p->clusteringMode = Parameters::GREEDY; p->removeTmpFiles = true; @@ -33,7 +35,6 @@ int complexcluster(int argc, const char **argv, const Command &command) { par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - setComplexClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); setComplexClusterMustPassAlong(&par); From 99251fd6bb99dd6af8708bd08205540d995f25df Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 26 Mar 2024 15:34:52 +0900 Subject: [PATCH 063/160] complexheader, but still issue exists --- data/complexcluster.sh | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index ca3deeb7..bb66f5dc 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -48,6 +48,20 @@ buildCmplDb() { cp "${1}.dbtype" "${2}.dbtype" } +buildCmplhName(){ + awk -F'\t| ' '{match($1, /(pdb|cif)/); + file_name=substr($1, 1, RSTART+RLENGTH-1); + print file_name }' "${1}" > "${1}_name" + + awk '{sub(/^[^ ]* /, ""); print}' "${1}" > "${1}_header" + + paste -d' ' "${1}_name" "${1}_header" > "${2}_tmp" + + paste -d'\t' "${1}_name" "${2}_tmp" > "${2}_redundant" + + awk '!seen[$1]++' "${2}_redundant" > "${2}" +} + # [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; if notExists "${TMP_PATH}/complex_result.dbtype"; then @@ -71,7 +85,14 @@ fi # Shift _h, _h.dbtype if notExists "${TMP_PATH}/complex_db_h.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_h" ${VERBOSITY_PAR} \ + "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} \ + || fail "tsv2db died" + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h_tmp" ${VERBOSITY_PAR} \ + || fail "createtsv died" + buildCmplhName "${TMP_PATH}/chain_db_h_tmp" "${TMP_PATH}/complex_db_header.tsv" + # shellcheck disable=SC2086 + "$MMSEQS" tsv2db "${TMP_PATH}/complex_db_header.tsv" "${TMP_PATH}/complex_db_h" ${VERBOSITY_PAR} \ || fail "tsv2db died" fi @@ -88,6 +109,12 @@ if [ -n "${REMOVE_TMP}" ]; then "$MMSEQS" rmdb "${TMP_PATH}/complex_filt" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} + # shellcheck disable=SC2086 + "$MMSEQS" rmdb "${TMP_PATH}/complex_db_h_tmp" ${VERBOSITY_PAR} + rm "${TMP_PATH}/chain_db_h_tmp" + rm "${TMP_PATH}/chain_db_h_tmp_name" + rm "${TMP_PATH}/chain_db_h_tmp_header" + rm "${TMP_PATH}/complex_db_header.tsv_redundant" rm -rf "${TMP_PATH}/complexsearch_tmp" rm -f "${TMP_PATH}/complexcluster.sh" fi \ No newline at end of file From 5e47a2ac50562ff8293483503d7a44712483e2eb Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 26 Mar 2024 15:45:25 +0900 Subject: [PATCH 064/160] still --- data/complexcluster.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index bb66f5dc..7dea47e4 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -84,9 +84,9 @@ fi # Shift _h, _h.dbtype if notExists "${TMP_PATH}/complex_db_h.dbtype"; then - # shellcheck disable=SC2086 - "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} \ - || fail "tsv2db died" + # # shellcheck disable=SC2086 + # "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} \ + # || fail "tsv2db died" # shellcheck disable=SC2086 "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h_tmp" ${VERBOSITY_PAR} \ || fail "createtsv died" @@ -110,10 +110,11 @@ if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_db_h_tmp" ${VERBOSITY_PAR} + # "$MMSEQS" rmdb "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} rm "${TMP_PATH}/chain_db_h_tmp" rm "${TMP_PATH}/chain_db_h_tmp_name" rm "${TMP_PATH}/chain_db_h_tmp_header" + rm "${TMP_PATH}/complex_db_header.tsv_tmp" rm "${TMP_PATH}/complex_db_header.tsv_redundant" rm -rf "${TMP_PATH}/complexsearch_tmp" rm -f "${TMP_PATH}/complexcluster.sh" From 94da95c4fe2789e3a023633697e66620be10e2f4 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 26 Mar 2024 23:27:04 +0900 Subject: [PATCH 065/160] complex header make --- data/complexcluster.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 7dea47e4..0113e836 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -59,7 +59,10 @@ buildCmplhName(){ paste -d'\t' "${1}_name" "${2}_tmp" > "${2}_redundant" - awk '!seen[$1]++' "${2}_redundant" > "${2}" + awk '!seen[$1]++' "${2}_redundant" > "${2}_nameheader" + + awk -F'\t' '{print NR-1 "\t" $2}' "${2}_nameheader" > "${2}" + } # [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; @@ -116,6 +119,7 @@ if [ -n "${REMOVE_TMP}" ]; then rm "${TMP_PATH}/chain_db_h_tmp_header" rm "${TMP_PATH}/complex_db_header.tsv_tmp" rm "${TMP_PATH}/complex_db_header.tsv_redundant" + rm "${TMP_PATH}/complex_db_header.tsv_tmp_nameheader" rm -rf "${TMP_PATH}/complexsearch_tmp" rm -f "${TMP_PATH}/complexcluster.sh" fi \ No newline at end of file From 6ef9dc7be9070d803124a1e647718e668fe4cfba Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 28 Mar 2024 11:08:03 +0900 Subject: [PATCH 066/160] modified complex header --- data/complexcluster.sh | 55 +++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 0113e836..681db924 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -48,23 +48,34 @@ buildCmplDb() { cp "${1}.dbtype" "${2}.dbtype" } -buildCmplhName(){ - awk -F'\t| ' '{match($1, /(pdb|cif)/); - file_name=substr($1, 1, RSTART+RLENGTH-1); - print file_name }' "${1}" > "${1}_name" - - awk '{sub(/^[^ ]* /, ""); print}' "${1}" > "${1}_header" - - paste -d' ' "${1}_name" "${1}_header" > "${2}_tmp" - - paste -d'\t' "${1}_name" "${2}_tmp" > "${2}_redundant" - - awk '!seen[$1]++' "${2}_redundant" > "${2}_nameheader" - - awk -F'\t' '{print NR-1 "\t" $2}' "${2}_nameheader" > "${2}" - +buldCmplhDb(){ + awk -F"\t" '{ + split($2,words," ") + split(words[1],parts,"_") + output_string="" + for (j = 1; j < length(parts); j++) { + output_string = output_string parts[j] + if (j < length(parts)-1){ + output_string=output_string"_" + } + } + for (t = 2; t < length(words); t++) { + output_string=output_string" "words[t] + } + print output_string + }' "${1}" > "${2}_redundant" + + awk 'BEGIN {index_value=0} + { + if (!seen[$1]) { + print index_value++"\t"$0 + seen[$1] = 1 + } + }' "${2}_redundant" > "${2}" + } + # [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; if notExists "${TMP_PATH}/complex_result.dbtype"; then @@ -93,9 +104,9 @@ if notExists "${TMP_PATH}/complex_db_h.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h_tmp" ${VERBOSITY_PAR} \ || fail "createtsv died" - buildCmplhName "${TMP_PATH}/chain_db_h_tmp" "${TMP_PATH}/complex_db_header.tsv" + buldCmplhDb "${TMP_PATH}/chain_db_h_tmp" "${TMP_PATH}/complex_header.tsv" # shellcheck disable=SC2086 - "$MMSEQS" tsv2db "${TMP_PATH}/complex_db_header.tsv" "${TMP_PATH}/complex_db_h" ${VERBOSITY_PAR} \ + "$MMSEQS" tsv2db "${TMP_PATH}/complex_header.tsv" "${TMP_PATH}/complex_db_h" ${VERBOSITY_PAR} \ || fail "tsv2db died" fi @@ -112,14 +123,8 @@ if [ -n "${REMOVE_TMP}" ]; then "$MMSEQS" rmdb "${TMP_PATH}/complex_filt" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} - # shellcheck disable=SC2086 - # "$MMSEQS" rmdb "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} - rm "${TMP_PATH}/chain_db_h_tmp" - rm "${TMP_PATH}/chain_db_h_tmp_name" - rm "${TMP_PATH}/chain_db_h_tmp_header" - rm "${TMP_PATH}/complex_db_header.tsv_tmp" - rm "${TMP_PATH}/complex_db_header.tsv_redundant" - rm "${TMP_PATH}/complex_db_header.tsv_tmp_nameheader" + rm "${TMP_PATH}/complex_header.tsv" + rm "${TMP_PATH}/complex_header.tsv_redundant" rm -rf "${TMP_PATH}/complexsearch_tmp" rm -f "${TMP_PATH}/complexcluster.sh" fi \ No newline at end of file From e44034e6c05b9ee40faf0cceea378b27a95b17c6 Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 28 Mar 2024 14:16:42 +0900 Subject: [PATCH 067/160] Implemented realloc function in Coordinate.h --- lib/tmalign/Coordinates.h | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/lib/tmalign/Coordinates.h b/lib/tmalign/Coordinates.h index b929987f..90e168d8 100644 --- a/lib/tmalign/Coordinates.h +++ b/lib/tmalign/Coordinates.h @@ -7,7 +7,7 @@ #include "simd.h" struct Coordinates{ - Coordinates(int size){ + Coordinates(int size) : size(size) { x =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); y =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); z =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); @@ -27,5 +27,35 @@ struct Coordinates{ float * x; float * y; float * z; + int size; + + void realloc(int newsize){ + if (allocated == false) { + x =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + y =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + z =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + allocated = true; + size = newsize; + } else { + if (newsize > size) { + float* new_x =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); + float* new_y =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); + float* new_z =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); + + if (x) memcpy(new_x, x, size * sizeof(float)); + if (y) memcpy(new_y, y, size * sizeof(float)); + if (z) memcpy(new_z, z, size * sizeof(float)); + + free(x); + free(y); + free(z); + + x = new_x; + y = new_y; + z = new_z; + size = newsize; + } + } + } }; #endif //STRUCCLUST_COORDINATES_H From 81fbfd996ed019f27611ebbc057e535ae55bc44a Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 28 Mar 2024 14:18:48 +0900 Subject: [PATCH 068/160] Implemented per-chain-tm but tmscore is suspicious --- src/strucclustutils/filtercomplex.cpp | 268 +++++++++++++++++++++----- 1 file changed, 215 insertions(+), 53 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 4c9f3838..92395e1d 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -8,6 +8,8 @@ #include "FileUtil.h" #include "TranslateNucl.h" #include "MemoryMapped.h" +#include "Coordinate16.h" +#include "tmalign/basic_fun.h" #include "createcomplexreport.h" #include "LDDT.h" #include "CalcProbTP.h" @@ -18,7 +20,6 @@ #endif - unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: @@ -55,8 +56,8 @@ static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ struct ComplexFilterCriteria { ComplexFilterCriteria() {} - ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM, std::vector qChainTmScores, std::vector tChainTmScores) : - dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM), qChainTmScores(qChainTmScores), tChainTmScores(tChainTmScores) {} + ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM) : + dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM) {} bool satisfyFilterCriteria(int covMode, float covThr, float TMThr) { const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); @@ -71,17 +72,117 @@ struct ComplexFilterCriteria { float tCov; double qTM; double tTM; - std::vector qChainTmScores; //TODO - std::vector tChainTmScores; //TODO - // std::vector alignedQChainKeys; //TODO - // std::vector alignedTChainKeys; //TODO + + std::vector alignedQChainKeys; //TODO + std::vector alignedTChainKeys; //TODO + std::vector alignedQChainTmScores; //TODO + std::vector alignedTChainTmScores; //TODO }; -//TODO -// std::vector computeChainTmScore() { -// std::vector tmScores; -// return tmScores; -// } +void fillUArr(const std::string &uString, float (&u)[3][3]) { + std::string tmp; + int i = 0; + int j=0; + for (auto c : uString) { + if (c == ',') { + u[i][j] = std::stof(tmp); + tmp.clear(); + j++; + } else { + tmp.push_back(c); + } + if (j == 3) { + i++; + j = 0; + } + } +} + +void fillTArr(const std::string &tString, float (&t)[3]) { + std::string tmp; + int i = 0; + for (auto c : tString) { + if (c == ',') { + t[i] = std::stof(tmp); + tmp.clear(); + i++; + } else { + tmp.push_back(c); + } + } +} + +unsigned int fillMatchedCoord(float * qdata, float * tdata, + Coordinates &qm, Coordinates &tm, + const std::string &cigar, int qStartPos, int tStartPos, int qLen, int tLen) { + std::vector qx, qy, qz, tx, ty, tz; + int qi = qStartPos; + int ti = tStartPos; + unsigned int qXPos = 0; + unsigned int qYPos = qLen; + unsigned int qZPos = qLen*2; + unsigned int tXPos = 0; + unsigned int tYPos = tLen; + unsigned int tZPos = tLen*2; + int mi = 0; + + std::string backtrace = Matcher::uncompressAlignment(cigar); + for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { + if (backtrace[btPos] == 'M') { + qx.push_back(qdata[qXPos + qi]); + qy.push_back(qdata[qYPos + qi]); + qz.push_back(qdata[qZPos + qi]); + tx.push_back(tdata[tXPos + ti]); + ty.push_back(tdata[tYPos + ti]); + tz.push_back(tdata[tZPos + ti]); + qi++; + ti++; + mi++; + } + else if (backtrace[btPos] == 'I') { + qi++; + } + else { + ti++; + } + } + qm.realloc(mi); + tm.realloc(mi); + std::copy(qx.begin(), qx.end(), qm.x); + std::copy(qy.begin(), qy.end(), qm.y); + std::copy(qz.begin(), qz.end(), qm.z); + std::copy(tx.begin(), tx.end(), tm.x); + std::copy(ty.begin(), ty.end(), tm.y); + std::copy(tz.begin(), tz.end(), tm.z); + qx.clear(); + qy.clear(); + qz.clear(); + tx.clear(); + ty.clear(); + tz.clear(); + + return mi; +} + +double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u[3][3], unsigned int mlen, int normlen) { + double tmscore = 0; + double d0=0.5; + if (normlen<=21); + else d0=1.24*pow((normlen-15),1.0/3)-1.8; + + // Coordinates tmt(mlen); + // BasicFunction::do_rotation(tm, tmt, mlen, t, u); + double d02 = 1.0; + for (unsigned int k=0; k &qChainKeys) { unsigned int qResidueLen = 0; for (auto qChainKey: qChainKeys) { @@ -141,20 +242,27 @@ int filtercomplex(int argc, const char **argv, const Command &command) { const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); int dbaccessMode = (DBReader::USE_INDEX); - std::map qKeyToSet; - std::map tKeyToSet; char buffer[32]; IndexReader* qDbr; qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + DBReader qStructDbr((par.db1 + "_ca").c_str(), (par.db1 + "_ca.index").c_str(), + par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + qStructDbr.open(DBReader::NOSORT); + IndexReader* tDbr; + DBReader *tStructDbr = NULL; if (sameDB) { tDbr = qDbr; + tStructDbr = &qStructDbr; } else{ tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + tStructDbr = new DBReader((par.db2 + "_ca").c_str(), (par.db2 + "_ca.index").c_str(), + par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + tStructDbr->open(DBReader::NOSORT); } - DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX| DBReader::USE_DATA); alnDbr.open(DBReader::LINEAR_ACCCESS); size_t localThreads = 1; @@ -170,21 +278,21 @@ int filtercomplex(int argc, const char **argv, const Command &command) { const int db5Type = Parameters::DBTYPE_GENERIC_DB; DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), 1, shouldCompress, db5Type); resultWrite5.open(); + resultToWrite_t result5; std::string qLookupFile = par.db1 + ".lookup"; std::string tLookupFile = par.db2 + ".lookup"; TranslateNucl translateNucl(static_cast(par.translationTable)); Matcher::result_t res; - std::map qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; - std::map> qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; + chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; + complexIdToChainKeys_t qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; std::map qcomplexIdToName, tcomplexIdToName; std::vector qComplexIdVec, tComplexIdVec; getlookupInfo(qLookupFile, qcomplexIdToName,qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); getlookupInfo(tLookupFile, tcomplexIdToName, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); qChainKeyToComplexIdMap.clear(); Debug::Progress progress(qComplexIdVec.size()); - std::map tComplexLength; - std::map qComplexLength; + std::map qComplexLength, tComplexLength; #pragma omp parallel num_threads(localThreads) { @@ -197,7 +305,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { unsigned int tComplexId = tComplexIdVec[tComplexIdx]; - std::vector &tChainKeys = tComplexIdToChainKeyMap[tComplexId]; + std::vector &tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); if (tChainKeys.empty()) { continue; } @@ -206,7 +314,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { unsigned int qComplexId = qComplexIdVec[qComplexIdx]; - std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; + std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); if (qChainKeys.empty()) { continue; } @@ -214,20 +322,32 @@ int filtercomplex(int argc, const char **argv, const Command &command) { qComplexLength[qComplexId] = reslen; } + for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { - std::map localComplexCriteria; + std::map localComplexMap; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; - std::vector &qChainKeys = qComplexIdToChainKeyMap[qComplexId]; + std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); + + Coordinate16 qcoords; + Coordinate16 tcoords; for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainDbKey = alnDbr.getId(qChainKey); + + int qChainLen = qDbr->sequenceReader->getSeqLen(qChainDbKey); + char *qcadata = qStructDbr.getData(qChainDbKey, thread_idx); + size_t qCaLength = qStructDbr.getEntryLen(qChainDbKey); + float* qdata = qcoords.read(qcadata, qChainLen, qCaLength); + if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY) { continue; } char *data = alnDbr.getData(qChainDbKey, thread_idx); + while (*data) { ComplexDataHandler retComplex = parseScoreComplexResult(data, res); + if (!retComplex.isValid){ Debug(Debug::ERROR) << "No scorecomplex result provided"; EXIT(EXIT_FAILURE); @@ -235,47 +355,71 @@ int filtercomplex(int argc, const char **argv, const Command &command) { data = Util::skipLine(data); unsigned int assId = retComplex.assId; - - if (localComplexCriteria.find(assId) == localComplexCriteria.end()) { - localComplexCriteria[assId] = ComplexFilterCriteria(); - localComplexCriteria[assId].dbKey = res.dbKey; - localComplexCriteria[assId].qTotalAlnLen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - localComplexCriteria[assId].tTotalAlnLen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - localComplexCriteria[assId].qTM = retComplex.qTmScore; - localComplexCriteria[assId].tTM = retComplex.tTmScore; - // localComplexCriteria[assId].qChainTmScores = computeChainTmScore(); //TODO - // localComplexCriteria[assId].tChainTmScores = computeChainTmScore(); //TODO + unsigned int tChainDbKey = res.dbKey; + + // DOING + float u[3][3]; + float t[3]; + Coordinates qm(0), tm(0); + fillUArr(retComplex.uString, u); + fillTArr(retComplex.tString, t); + + int tChainLen = tDbr->sequenceReader->getSeqLen(tChainDbKey); + char *tcadata = tStructDbr->getData(tChainDbKey, thread_idx); + size_t tCaLength = tStructDbr->getEntryLen(tChainDbKey); + float* tdata = tcoords.read(tcadata, tChainLen, tCaLength); + unsigned int normlen = std::max(res.qLen, res.dbLen); + unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); + double qtm = chainTm * normlen / qChainLen; + double ttm = chainTm * normlen / tChainLen; + + if (localComplexMap.find(assId) == localComplexMap.end()) { + localComplexMap[assId] = ComplexFilterCriteria(); + localComplexMap[assId].dbKey = res.dbKey; + localComplexMap[assId].qTotalAlnLen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + localComplexMap[assId].tTotalAlnLen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + localComplexMap[assId].qTM = retComplex.qTmScore; + localComplexMap[assId].tTM = retComplex.tTmScore; + localComplexMap[assId].alignedQChainKeys.push_back(qChainDbKey); + localComplexMap[assId].alignedTChainKeys.push_back(tChainDbKey); + localComplexMap[assId].alignedQChainTmScores.push_back(qtm); + localComplexMap[assId].alignedTChainTmScores.push_back(ttm); } else { - localComplexCriteria[assId].qTotalAlnLen += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - localComplexCriteria[assId].tTotalAlnLen += (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + localComplexMap[assId].qTotalAlnLen += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + localComplexMap[assId].tTotalAlnLen += (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + localComplexMap[assId].alignedQChainKeys.push_back(qChainDbKey); + localComplexMap[assId].alignedTChainKeys.push_back(res.dbKey); + localComplexMap[assId].alignedQChainTmScores.push_back(qtm); + localComplexMap[assId].alignedTChainTmScores.push_back(ttm); } } } std::string result; - std::string result5; + // std::string result5; std::vector assIdsToDelete; - for (auto& assId_res : localComplexCriteria){ - unsigned int tComplexId = tChainKeyToComplexIdMap[assId_res.second.dbKey]; - assId_res.second.qCov = static_cast(assId_res.second.qTotalAlnLen) / static_cast(qComplexLength[qComplexId]); - assId_res.second.tCov = static_cast(assId_res.second.tTotalAlnLen) / static_cast(tComplexLength[tComplexId]); + for (auto& assId_res : localComplexMap){ + unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); + assId_res.second.qCov = static_cast(assId_res.second.qTotalAlnLen) / static_cast(qComplexLength.at(qComplexId)); + assId_res.second.tCov = static_cast(assId_res.second.tTotalAlnLen) / static_cast(tComplexLength.at(tComplexId)); if (!assId_res.second.satisfyFilterCriteria(par.covMode, par.covThr, par.filtTmThr)){ assIdsToDelete.push_back(assId_res.first); } } for (const auto& key : assIdsToDelete) { - localComplexCriteria.erase(key); + localComplexMap.erase(key); } std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] - for (const auto& assId_res : localComplexCriteria){ - unsigned int tComplexId = tChainKeyToComplexIdMap[assId_res.second.dbKey]; + for (const auto& assId_res : localComplexMap){ + unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); unsigned int alnlen = adjustAlnLen(assId_res.second.qTotalAlnLen, assId_res.second.tTotalAlnLen, par.covMode); if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } else{ - if (alnlen > cmplIdToBestAssId[tComplexId][1]){ + if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } } @@ -286,19 +430,24 @@ int filtercomplex(int argc, const char **argv, const Command &command) { selectedAssIDs.push_back(pair.second[0]); } - for (const auto& assId_res : localComplexCriteria){ - unsigned int tComplexId = tChainKeyToComplexIdMap[assId_res.second.dbKey]; - if (std::find(selectedAssIDs.begin(), selectedAssIDs.end(), assId_res.first) != selectedAssIDs.end()){ - char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); - result.append(buffer, (outpos - buffer - 1)); - result.push_back('\n'); - result5.append(std::to_string(assId_res.first) + "\t" +qcomplexIdToName[qComplexId] + "\t" + tcomplexIdToName[tComplexId] + "\t" + std::to_string(assId_res.second.qCov) + "\t" + std::to_string(assId_res.second.tCov) + "\t"+ std::to_string(assId_res.second.qTM)+"\t"+ std::to_string(assId_res.second.tTM)+ "\n"); - } + for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ + unsigned int assId = selectedAssIDs[assIdidx]; + unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).dbKey); + char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); + result.append(buffer, (outpos - buffer - 1)); + result.push_back('\n'); + result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(tComplexId) + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); + + } resultWriter.writeData(result.c_str(), result.length(), qComplexId); - resultWrite5.writeData(result5.c_str(), result5.length(), 0); + + localComplexMap.clear(); + selectedAssIDs.clear(); + cmplIdToBestAssId.clear(); } } + resultWrite5.writeData(result5.c_str(), result5.length(), 0); resultWriter.close(true); resultWrite5.close(true); alnDbr.close(); @@ -306,5 +455,18 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (sameDB == false) { delete tDbr; } + + result5.clear(); + qChainKeyToComplexIdMap.clear(); + tChainKeyToComplexIdMap.clear(); + qComplexIdToChainKeyMap.clear(); + tComplexIdToChainKeyMap.clear(); + qcomplexIdToName.clear(); + tcomplexIdToName.clear(); + qComplexIdVec.clear(); + tComplexIdVec.clear(); + qComplexLength.clear(); + tComplexLength.clear(); + return EXIT_SUCCESS; } From 79ad721c1e50268dee7b432408782709c5d66c61 Mon Sep 17 00:00:00 2001 From: rachelse Date: Fri, 29 Mar 2024 14:40:26 +0900 Subject: [PATCH 069/160] Solved weird chain TM-score behavior --- src/commons/LocalParameters.cpp | 10 +- src/commons/LocalParameters.h | 6 +- src/strucclustutils/filtercomplex.cpp | 148 +++++++++++++++++--------- src/workflow/ComplexCluster.cpp | 2 +- 4 files changed, 105 insertions(+), 61 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 3713403c..48c9890e 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -31,8 +31,8 @@ LocalParameters::LocalParameters() : PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN), PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), - PARAM_FILT_TM_THRESHOLD(PARAM_FILT_TM_THRESHOLD_ID,"--filter-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_FILT_CHAIN_TM_THRESHOLD(PARAM_FILT_CHAIN_TM_THRESHOLD_ID,"--min-chain-tm", "per chain TMscore threshold for filtercomplex", "accept alignments satisfying tmscores of all chains > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") + PARAM_COMPLEX_TM_THRESHOLD(PARAM_COMPLEX_TM_THRESHOLD_ID,"--complex-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtComplexTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "per chain TMscore threshold for filtercomplex", "accept alignments satisfying tmscores of all chains > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; @@ -186,8 +186,8 @@ LocalParameters::LocalParameters() : filtercomplex.push_back(&PARAM_THREADS); filtercomplex.push_back(&PARAM_C); filtercomplex.push_back(&PARAM_COV_MODE); - filtercomplex.push_back(&PARAM_FILT_TM_THRESHOLD); - filtercomplex.push_back(&PARAM_FILT_CHAIN_TM_THRESHOLD); + filtercomplex.push_back(&PARAM_COMPLEX_TM_THRESHOLD); + filtercomplex.push_back(&PARAM_CHAIN_TM_THRESHOLD); // createcomplexreport createcomplexreport.push_back(&PARAM_DB_OUTPUT); @@ -249,7 +249,7 @@ LocalParameters::LocalParameters() : complexReportMode = 1; eValueThrExpandComplex = 10000.0; citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); - filtTmThr = 0.0; + filtComplexTmThr = 0.0; filtChainTmThr = 0.0; //rewrite param vals. diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 2fe646ed..ffe8bed9 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -125,8 +125,8 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_EXPAND_COMPLEX_EVALUE) PARAMETER(PARAM_INPUT_FORMAT) PARAMETER(PARAM_PDB_OUTPUT_MODE) - PARAMETER(PARAM_FILT_TM_THRESHOLD) - PARAMETER(PARAM_FILT_CHAIN_TM_THRESHOLD) + PARAMETER(PARAM_COMPLEX_TM_THRESHOLD) + PARAMETER(PARAM_CHAIN_TM_THRESHOLD) int prefMode; float tmScoreThr; @@ -150,7 +150,7 @@ class LocalParameters : public Parameters { double eValueThrExpandComplex; int inputFormat; int pdbOutputMode; - float filtTmThr; + float filtComplexTmThr; float filtChainTmThr; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 92395e1d..e75201d5 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -54,15 +54,62 @@ static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ } } +bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores) { + for (size_t i = 0; i < qChainTmScores.size(); i++) { + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + if (qChainTmScores[i] < chainTMThr || tChainTmScores[i] < chainTMThr) { + return false; + } + break; + case Parameters::COV_MODE_TARGET: + if (tChainTmScores[i] < chainTMThr) { + return false; + } + break; + case Parameters::COV_MODE_QUERY: + if (qChainTmScores[i] < chainTMThr) { + return false; + } + break; + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + break; + } + } + return true; +} + struct ComplexFilterCriteria { ComplexFilterCriteria() {} - ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM) : - dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM) {} + ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM, double qChainTm, double tChainTm) : + dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM) { + alignedQChainTmScores.push_back(qChainTm); + alignedTChainTmScores.push_back(tChainTm); + } + ~ComplexFilterCriteria() { + alignedQChainTmScores.clear(); + alignedTChainTmScores.clear(); + } - bool satisfyFilterCriteria(int covMode, float covThr, float TMThr) { + bool satisfy(int covMode, float covThr, float TMThr, float chainTMThr) { const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); - return (covOK && TMOK); + const bool chainTMOK = hasChainTm(chainTMThr, covMode, alignedQChainTmScores, alignedTChainTmScores); + return (covOK && TMOK && chainTMOK); + } + + void update(unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm) { + this->qTotalAlnLen += qTotalAlnLen; + this->tTotalAlnLen += tTotalAlnLen; + this->alignedQChainTmScores.push_back(qChainTm); + this->alignedTChainTmScores.push_back(tChainTm); + } + + void calcCov(unsigned int qLen, unsigned int tLen) { + qCov = static_cast(qTotalAlnLen) / static_cast(qLen); + tCov = static_cast(tTotalAlnLen) / static_cast(tLen); } unsigned int dbKey; @@ -73,23 +120,24 @@ struct ComplexFilterCriteria { double qTM; double tTM; - std::vector alignedQChainKeys; //TODO - std::vector alignedTChainKeys; //TODO - std::vector alignedQChainTmScores; //TODO - std::vector alignedTChainTmScores; //TODO + std::vector alignedQChainTmScores; + std::vector alignedTChainTmScores; }; void fillUArr(const std::string &uString, float (&u)[3][3]) { std::string tmp; int i = 0; int j=0; - for (auto c : uString) { - if (c == ',') { + const int ulen = static_cast(uString.size()); + for (int k=0; k < ulen; k++) { + if (k==ulen-1) { + u[i][j] = std::stof(tmp); + } else if (uString[k] == ',') { u[i][j] = std::stof(tmp); tmp.clear(); j++; } else { - tmp.push_back(c); + tmp.push_back(uString[k]); } if (j == 3) { i++; @@ -101,13 +149,16 @@ void fillUArr(const std::string &uString, float (&u)[3][3]) { void fillTArr(const std::string &tString, float (&t)[3]) { std::string tmp; int i = 0; - for (auto c : tString) { - if (c == ',') { + const int tlen = static_cast(tString.size()); + for (int k=0; k localComplexMap; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; @@ -357,7 +416,6 @@ int filtercomplex(int argc, const char **argv, const Command &command) { unsigned int assId = retComplex.assId; unsigned int tChainDbKey = res.dbKey; - // DOING float u[3][3]; float t[3]; Coordinates qm(0), tm(0); @@ -368,45 +426,33 @@ int filtercomplex(int argc, const char **argv, const Command &command) { char *tcadata = tStructDbr->getData(tChainDbKey, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbKey); float* tdata = tcoords.read(tcadata, tChainLen, tCaLength); - unsigned int normlen = std::max(res.qLen, res.dbLen); + unsigned int normlen = std::min(res.qLen, res.dbLen); unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); - double qtm = chainTm * normlen / qChainLen; - double ttm = chainTm * normlen / tChainLen; + double qChainTm = chainTm / qChainLen; + double tChainTm = chainTm / tChainLen; + unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); if (localComplexMap.find(assId) == localComplexMap.end()) { - localComplexMap[assId] = ComplexFilterCriteria(); - localComplexMap[assId].dbKey = res.dbKey; - localComplexMap[assId].qTotalAlnLen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - localComplexMap[assId].tTotalAlnLen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - localComplexMap[assId].qTM = retComplex.qTmScore; - localComplexMap[assId].tTM = retComplex.tTmScore; - localComplexMap[assId].alignedQChainKeys.push_back(qChainDbKey); - localComplexMap[assId].alignedTChainKeys.push_back(tChainDbKey); - localComplexMap[assId].alignedQChainTmScores.push_back(qtm); - localComplexMap[assId].alignedTChainTmScores.push_back(ttm); + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); + localComplexMap[assId] = cmplfiltcrit; } else { - localComplexMap[assId].qTotalAlnLen += (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - localComplexMap[assId].tTotalAlnLen += (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - localComplexMap[assId].alignedQChainKeys.push_back(qChainDbKey); - localComplexMap[assId].alignedTChainKeys.push_back(res.dbKey); - localComplexMap[assId].alignedQChainTmScores.push_back(qtm); - localComplexMap[assId].alignedTChainTmScores.push_back(ttm); + localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); } } } std::string result; - // std::string result5; std::vector assIdsToDelete; for (auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); - assId_res.second.qCov = static_cast(assId_res.second.qTotalAlnLen) / static_cast(qComplexLength.at(qComplexId)); - assId_res.second.tCov = static_cast(assId_res.second.tTotalAlnLen) / static_cast(tComplexLength.at(tComplexId)); - if (!assId_res.second.satisfyFilterCriteria(par.covMode, par.covThr, par.filtTmThr)){ + assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); + if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr)){ assIdsToDelete.push_back(assId_res.first); } } + for (const auto& key : assIdsToDelete) { localComplexMap.erase(key); } @@ -418,7 +464,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } - else{ + else { if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } @@ -437,8 +483,6 @@ int filtercomplex(int argc, const char **argv, const Command &command) { result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(tComplexId) + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); - - } resultWriter.writeData(result.c_str(), result.length(), qComplexId); diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 191225c5..79ef80b4 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -10,7 +10,7 @@ void setComplexClusterDefaults(LocalParameters *p) { p->covThr = 0.8; - p->filtTmThr = 0.5; // FIX + p->filtComplexTmThr = 0.5; // FIX p->filtChainTmThr=0.0; // FIX p->covMode = 1; p->clusteringMode = Parameters::GREEDY; From 77936ab3d18f53e5db5ddbf834c60b6b4ddf3b92 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 11 Apr 2024 06:25:04 +0900 Subject: [PATCH 070/160] handling monomer & calculate chainTM if complexTM satisfied --- src/strucclustutils/filtercomplex.cpp | 76 +++++++++++++++++---------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index e75201d5..ef4fadcb 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -93,11 +93,10 @@ struct ComplexFilterCriteria { alignedTChainTmScores.clear(); } - bool satisfy(int covMode, float covThr, float TMThr, float chainTMThr) { + bool satisfy(int covMode, float covThr, float chainTMThr) { const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); - const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); const bool chainTMOK = hasChainTm(chainTMThr, covMode, alignedQChainTmScores, alignedTChainTmScores); - return (covOK && TMOK && chainTMOK); + return (covOK && chainTMOK); } void update(unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm) { @@ -326,8 +325,9 @@ int filtercomplex(int argc, const char **argv, const Command &command) { alnDbr.open(DBReader::LINEAR_ACCCESS); size_t localThreads = 1; + #ifdef OPENMP - //localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); +//localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); #endif const bool shouldCompress = (par.compressed == true); const int db4Type = Parameters::DBTYPE_CLUSTER_RES; @@ -389,19 +389,26 @@ int filtercomplex(int argc, const char **argv, const Command &command) { Coordinate16 qcoords; Coordinate16 tcoords; - + std::string result; + std::vector discardAssIdvec; for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainDbKey = alnDbr.getId(qChainKey); + if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY){ + Debug(Debug::ERROR) << "Monomer chain key: "<sequenceReader->getSeqLen(qChainDbKey); char *qcadata = qStructDbr.getData(qChainDbKey, thread_idx); size_t qCaLength = qStructDbr.getEntryLen(qChainDbKey); float* qdata = qcoords.read(qcadata, qChainLen, qCaLength); - if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY) { - continue; - } char *data = alnDbr.getData(qChainDbKey, thread_idx); while (*data) { @@ -414,41 +421,52 @@ int filtercomplex(int argc, const char **argv, const Command &command) { data = Util::skipLine(data); unsigned int assId = retComplex.assId; - unsigned int tChainDbKey = res.dbKey; + unsigned int tChainDbKey = res.dbKey; + unsigned int tChainKey = tDbr->sequenceReader->getDbKey(tChainDbKey); + tChainDbKey = alnDbr.getId(tChainKey); - float u[3][3]; - float t[3]; - Coordinates qm(0), tm(0); - fillUArr(retComplex.uString, u); - fillTArr(retComplex.tString, t); + if (tChainDbKey == NOT_AVAILABLE_CHAIN_KEY){ + // Debug(Debug::ERROR) << "tChainKey"<sequenceReader->getSeqLen(tChainDbKey); char *tcadata = tStructDbr->getData(tChainDbKey, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbKey); float* tdata = tcoords.read(tcadata, tChainLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); - unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); - double qChainTm = chainTm / qChainLen; - double tChainTm = chainTm / tChainLen; - unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - - if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); - localComplexMap[assId] = cmplfiltcrit; - } else { - localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); - } + + + if (hasTM(par.filtComplexTmThr, par.covMode, retComplex.qTmScore, retComplex.tTmScore)){ + unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + + float u[3][3]; + float t[3]; + Coordinates qm(0), tm(0); + fillUArr(retComplex.uString, u); + fillTArr(retComplex.tString, t); + unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); + double qChainTm = chainTm / qChainLen; + double tChainTm = chainTm / tChainLen; + + if (localComplexMap.find(assId) == localComplexMap.end()) { + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); + localComplexMap[assId] = cmplfiltcrit; + } else { + localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); + } + } + } } - std::string result; std::vector assIdsToDelete; for (auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); - if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr)){ + if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtChainTmThr)){ assIdsToDelete.push_back(assId_res.first); } } From 36490d181f729ab21caf33569524fee2fc8a99f1 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 11 Apr 2024 06:49:38 +0900 Subject: [PATCH 071/160] minor change --- src/strucclustutils/filtercomplex.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index ef4fadcb..be166e29 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -438,18 +438,18 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (hasTM(par.filtComplexTmThr, par.covMode, retComplex.qTmScore, retComplex.tTmScore)){ - unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - float u[3][3]; float t[3]; Coordinates qm(0), tm(0); fillUArr(retComplex.uString, u); fillTArr(retComplex.tString, t); + unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); double qChainTm = chainTm / qChainLen; double tChainTm = chainTm / tChainLen; + unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); if (localComplexMap.find(assId) == localComplexMap.end()) { ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); From f3a9c22b89bdb1ea7a1642779ef3fcefba54a3fc Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 11 Apr 2024 07:24:39 +0900 Subject: [PATCH 072/160] minor --- src/strucclustutils/filtercomplex.cpp | 85 +++++++++++++++------------ 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index be166e29..cc6d4197 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -55,27 +55,29 @@ static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ } bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores) { - for (size_t i = 0; i < qChainTmScores.size(); i++) { - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - if (qChainTmScores[i] < chainTMThr || tChainTmScores[i] < chainTMThr) { - return false; - } - break; - case Parameters::COV_MODE_TARGET: - if (tChainTmScores[i] < chainTMThr) { - return false; - } - break; - case Parameters::COV_MODE_QUERY: - if (qChainTmScores[i] < chainTMThr) { - return false; - } - break; - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : - break; + if (chainTMThr > 0 ){ + for (size_t i = 0; i < qChainTmScores.size(); i++) { + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + if (qChainTmScores[i] < chainTMThr || tChainTmScores[i] < chainTMThr) { + return false; + } + break; + case Parameters::COV_MODE_TARGET: + if (tChainTmScores[i] < chainTMThr) { + return false; + } + break; + case Parameters::COV_MODE_QUERY: + if (qChainTmScores[i] < chainTMThr) { + return false; + } + break; + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + break; + } } } return true; @@ -438,26 +440,35 @@ int filtercomplex(int argc, const char **argv, const Command &command) { if (hasTM(par.filtComplexTmThr, par.covMode, retComplex.qTmScore, retComplex.tTmScore)){ - float u[3][3]; - float t[3]; - Coordinates qm(0), tm(0); - fillUArr(retComplex.uString, u); - fillTArr(retComplex.tString, t); - - unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); - double qChainTm = chainTm / qChainLen; - double tChainTm = chainTm / tChainLen; unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - - if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); - localComplexMap[assId] = cmplfiltcrit; - } else { - localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); + if (par.filtChainTmThr > 0 ){ + float u[3][3]; + float t[3]; + Coordinates qm(0), tm(0); + fillUArr(retComplex.uString, u); + fillTArr(retComplex.tString, t); + unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); + double qChainTm = chainTm / qChainLen; + double tChainTm = chainTm / tChainLen; + + if (localComplexMap.find(assId) == localComplexMap.end()) { + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); + localComplexMap[assId] = cmplfiltcrit; + } else { + localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); + } } + else{ + if (localComplexMap.find(assId) == localComplexMap.end()) { + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, 1, 1); + localComplexMap[assId] = cmplfiltcrit; + } else { + localComplexMap.at(assId).update(qtotalaln, ttotalaln, 1, 1); + } } + } } } From 0bc9d97d139a699a6eb0e323ecdeb723f940dc63 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 11 Apr 2024 07:25:31 +0900 Subject: [PATCH 073/160] minor --- src/strucclustutils/filtercomplex.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index cc6d4197..4d597393 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -384,6 +384,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { qComplexLength[qComplexId] = reslen; } + Debug(Debug::ERROR) << "Monomer will be treated as singleton\nMonomer chain key: \n"; for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { std::map localComplexMap; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; @@ -398,8 +399,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { unsigned int qChainDbKey = alnDbr.getId(qChainKey); if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY){ - Debug(Debug::ERROR) << "Monomer chain key: "< Date: Fri, 12 Apr 2024 17:57:31 +0900 Subject: [PATCH 074/160] make filtcov.tsv not db --- src/strucclustutils/filtercomplex.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 4d597393..ca492168 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -522,7 +522,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } resultWrite5.writeData(result5.c_str(), result5.length(), 0); resultWriter.close(true); - resultWrite5.close(true); + resultWrite5.close(par.dbOut == false); alnDbr.close(); delete qDbr; if (sameDB == false) { From 961b8cf0c3623c483c15fe42c9dc3728b462ffa6 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 15 Apr 2024 22:17:37 +0900 Subject: [PATCH 075/160] removing extension --- src/strucclustutils/structcreatedb.cpp | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/strucclustutils/structcreatedb.cpp b/src/strucclustutils/structcreatedb.cpp index a198bff2..20a70935 100644 --- a/src/strucclustutils/structcreatedb.cpp +++ b/src/strucclustutils/structcreatedb.cpp @@ -74,6 +74,18 @@ static inline bool compareByFirst(const std::pair& a, const std::pair & alphabet3di, std::vector & alphabetAA, @@ -146,7 +158,7 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru torsiondbw.writeData(alphabet3di.data(), alphabet3di.size(), dbKey, thread_idx); aadbw.writeData(alphabetAA.data(), alphabetAA.size(), dbKey, thread_idx); header.clear(); - header.append(readStructure.names[ch]); + header.append(Util::remove_extension(readStructure.names[ch])); if(readStructure.modelCount > 1){ header.append("_MODEL_"); header.append(std::to_string(readStructure.modelIndices[ch])); @@ -164,14 +176,14 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru std::string entryName = Util::parseFastaHeader(header.c_str()); #pragma omp critical { - std::map::iterator it = filenameToFileId.find(filename); + std::map::iterator it = filenameToFileId.find(Util::remove_extension(filename)); size_t fileid; if (it != filenameToFileId.end()) { fileid = it->second; } else { fileid = fileidCnt; - filenameToFileId[filename] = fileid; - fileIdToName[fileid] = filename; + filenameToFileId[Util::remove_extension(filename)] = fileid; + fileIdToName[fileid] = Util::remove_extension(filename); fileidCnt++; } entrynameToFileId[entryName] = std::make_pair(fileid, readStructure.modelIndices[ch]); @@ -853,8 +865,9 @@ int structcreatedb(int argc, const char **argv, const Command& command) { for (unsigned int id = 0; id < readerHeader.getSize(); id++) { char *header = readerHeader.getData(id, 0); entry.id = readerHeader.getDbKey(id); - entry.entryName = Util::parseFastaHeader(header); - std::pair fileIdModelEntry = entrynameToFileId[entry.entryName]; + std::string entryNameWithModel = Util::parseFastaHeader(header); + entry.entryName = removeModel(entryNameWithModel); + std::pair fileIdModelEntry = entrynameToFileId[entryNameWithModel]; size_t fileId = fileIdModelEntry.first; if(modelFileIdLookup.find(fileIdModelEntry) == modelFileIdLookup.end()){ modelFileIdLookup[fileIdModelEntry] = globalFileNumber; From d02373d8c4523874680a791712775a0a4155f0ce Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 16 Apr 2024 12:04:00 +0900 Subject: [PATCH 076/160] parsing --- data/easycomplexcluster.sh | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index eea901fa..28165015 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -28,16 +28,20 @@ abspath() { mapCmplName2ChainKeys() { awk -F"\t" ' - # BEGIN { - # split(repComp,repCompArr," "); - # for (i in repCompArr) {repCompArr[repCompArr[i]]=""} - # } NR==FNR { - repName_memName[$1]=$2;next + split($2,repArr,"_MODEL") + repName_memName[repArr[1]]=1;next } { - split($2,cmpNameArr,".pdb"); cmpl=cmpNameArr[1]".pdb" - if (cmpl in repName_memName) { + split($2,parts,"_") + output_string="" + for (j = 1; j < length(parts); j++) { + output_string = output_string parts[j] + if (j < length(parts)-1){ + output_string=output_string"_" + } + } + if (output_string in repName_memName) { print $1 } } @@ -89,7 +93,7 @@ if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then # shellcheck disable=SC2086 "$MMSEQS" result2flat "${SOURCE}" "${SOURCE}" "${TMP_PATH}/complex_rep_seqs" "${TMP_PATH}/complex_rep_seq.fasta" ${VERBOSITY_PAR} \ || fail "result2flat died" - postprocessFasta "${TMP_PATH}/complex_rep_seq.fasta" + # postprocessFasta "${TMP_PATH}/complex_rep_seq.fasta" fi #TODO: generate fasta file for all sequences From 6f3ac2b6c79433a595be239f471bc1fe07d401c3 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 16 Apr 2024 15:26:48 +0900 Subject: [PATCH 077/160] parsing with pdb --- data/easycomplexcluster.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index 28165015..e2cb9eac 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -51,8 +51,15 @@ mapCmplName2ChainKeys() { postprocessFasta() { awk ' BEGIN {FS=">"} $0 ~/^>/ { - match($2, /(.*).pdb*/) - complex = substr($2, RSTART, RLENGTH-4) + # match($2, /(.*).pdb*/) + split($2,parts,"_") + complex="" + for (j = 1; j < length(parts); j++) { + complex = complex parts[j] + if (j < length(parts)-1){ + complex=complex"_" + } + } if (!(complex in repComplex)) { print "#"complex".pdb" repComplex[complex] = "" From 55b5338c57dbbab291e46d4c57d19ace39059eed Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 16 Apr 2024 15:47:41 +0900 Subject: [PATCH 078/160] memcpy error solve --- lib/tmalign/Coordinates.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/tmalign/Coordinates.h b/lib/tmalign/Coordinates.h index 90e168d8..b0f7dcec 100644 --- a/lib/tmalign/Coordinates.h +++ b/lib/tmalign/Coordinates.h @@ -5,6 +5,7 @@ #ifndef STRUCCLUST_COORDINATES_H #define STRUCCLUST_COORDINATES_H #include "simd.h" +#include struct Coordinates{ Coordinates(int size) : size(size) { From cf28e0764116b9a583101aa8a90273255fd98a08 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 16 Apr 2024 17:13:30 +0900 Subject: [PATCH 079/160] parsing problem solved --- data/easycomplexcluster.sh | 53 +++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/data/easycomplexcluster.sh b/data/easycomplexcluster.sh index e2cb9eac..00e95db1 100644 --- a/data/easycomplexcluster.sh +++ b/data/easycomplexcluster.sh @@ -27,25 +27,36 @@ abspath() { } mapCmplName2ChainKeys() { - awk -F"\t" ' - NR==FNR { - split($2,repArr,"_MODEL") - repName_memName[repArr[1]]=1;next + awk -F"\t" 'FNR==1 {++fIndex} + fIndex==1 { + repName[$1]=1 + if (match($1, /MODEL/)){ + tmpName[$1]=1 + }else{ + tmpName[$1"_MODEL_1"]=1 + } + next } - { - split($2,parts,"_") - output_string="" - for (j = 1; j < length(parts); j++) { - output_string = output_string parts[j] - if (j < length(parts)-1){ - output_string=output_string"_" + fIndex==2{ + if (match($2, /MODEL/)){ + if ($2 in tmpName){ + repId[$1]=1 + }else{ + ho[1]=1 + } + }else{ + if ($2 in repName){ + repId[$1]=1 } } - if (output_string in repName_memName) { + next + } + { + if ($3 in repId){ print $1 } } - ' "${1}" "${2}.lookup" > "${3}" + ' "${1}" "${2}.source" "${2}.lookup" > "${3}" } postprocessFasta() { @@ -53,15 +64,15 @@ postprocessFasta() { $0 ~/^>/ { # match($2, /(.*).pdb*/) split($2,parts,"_") - complex="" - for (j = 1; j < length(parts); j++) { - complex = complex parts[j] - if (j < length(parts)-1){ - complex=complex"_" - } + complex="" + for (j = 1; j < length(parts); j++) { + complex = complex parts[j] + if (j < length(parts)-1){ + complex=complex"_" } + } if (!(complex in repComplex)) { - print "#"complex".pdb" + print "#"complex repComplex[complex] = "" } } @@ -100,7 +111,7 @@ if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then # shellcheck disable=SC2086 "$MMSEQS" result2flat "${SOURCE}" "${SOURCE}" "${TMP_PATH}/complex_rep_seqs" "${TMP_PATH}/complex_rep_seq.fasta" ${VERBOSITY_PAR} \ || fail "result2flat died" - # postprocessFasta "${TMP_PATH}/complex_rep_seq.fasta" + postprocessFasta "${TMP_PATH}/complex_rep_seq.fasta" fi #TODO: generate fasta file for all sequences From 8f2ab715519b57f0e368c1f885ca1e7af5dd6b0c Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 16 Apr 2024 17:54:24 +0900 Subject: [PATCH 080/160] simplify building complex header --- data/complexcluster.sh | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 681db924..57aadd75 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -49,7 +49,8 @@ buildCmplDb() { } buldCmplhDb(){ - awk -F"\t" '{ + awk -F"\t" 'BEGIN {INDEXVAL=0} + { split($2,words," ") split(words[1],parts,"_") output_string="" @@ -59,20 +60,17 @@ buldCmplhDb(){ output_string=output_string"_" } } - for (t = 2; t < length(words); t++) { - output_string=output_string" "words[t] + headerstring="" + for (k = 2; k < length(words)+1; k++) { + headerstring = headerstring words[k]" " } - print output_string - }' "${1}" > "${2}_redundant" - - awk 'BEGIN {index_value=0} - { - if (!seen[$1]) { - print index_value++"\t"$0 - seen[$1] = 1 - } - }' "${2}_redundant" > "${2}" - + if (!(output_string not in gogo)){ + print INDEXVAL"\t"output_string" "headerstring + INDEXVAL++ + } + gogo[output_string]=1 + + }' "${1}" > "${2}" } @@ -102,9 +100,9 @@ if notExists "${TMP_PATH}/complex_db_h.dbtype"; then # "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} \ # || fail "tsv2db died" # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h_tmp" ${VERBOSITY_PAR} \ + "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" ${VERBOSITY_PAR} \ || fail "createtsv died" - buldCmplhDb "${TMP_PATH}/chain_db_h_tmp" "${TMP_PATH}/complex_header.tsv" + buldCmplhDb "${TMP_PATH}/chain_db_h.tsv" "${TMP_PATH}/complex_header.tsv" # shellcheck disable=SC2086 "$MMSEQS" tsv2db "${TMP_PATH}/complex_header.tsv" "${TMP_PATH}/complex_db_h" ${VERBOSITY_PAR} \ || fail "tsv2db died" From e333ad486519eacce371ec03b769655ea4f5da76 Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 18 Apr 2024 11:24:41 +0900 Subject: [PATCH 081/160] Made few comments reviewing filtercomplex.cpp --- src/strucclustutils/filtercomplex.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index ca492168..7aa2b8be 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -120,7 +120,7 @@ struct ComplexFilterCriteria { float tCov; double qTM; double tTM; - + //TODO : Instead of saving all the chain tm scores, only keeping the worst one? std::vector alignedQChainTmScores; std::vector alignedTChainTmScores; }; @@ -438,7 +438,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { float* tdata = tcoords.read(tcadata, tChainLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); - + // TODO: do not check the TM score here, be consistent with the other filters if (hasTM(par.filtComplexTmThr, par.covMode, retComplex.qTmScore, retComplex.tTmScore)){ unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); From a78bbd5d8e47d531b6cfa12518c9a165b5f9cd1f Mon Sep 17 00:00:00 2001 From: rachelse Date: Fri, 19 Apr 2024 18:34:59 +0900 Subject: [PATCH 082/160] Set default param as set4final when computing chaintmscore --- src/strucclustutils/filtercomplex.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 7aa2b8be..a2620173 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -221,13 +221,22 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u float d0; // float score_d8 = 1.5*pow(normlen,0.3)+3.5; - if (normlen<=19) { - d0=0.168; + // set4search + // if (normlen<=19) { + // d0=0.168; + // } + // else { + // d0=1.24*pow((normlen-15),1.0/3)-1.8; + // } + // d0 += 0.8; + + // set4final + if (normlen<=21) { + d0=0.5; } else { d0=1.24*pow((normlen-15),1.0/3)-1.8; } - d0 += 0.8; Coordinates tmt(mlen); BasicFunction::do_rotation(tm, tmt, mlen, t, u); From 0430e9e50a3d0a7a36d07b13967a10a2727b6a6e Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Fri, 19 Apr 2024 20:21:36 +0900 Subject: [PATCH 083/160] Calculate chain TM everytime --- src/strucclustutils/filtercomplex.cpp | 43 ++++++++++++++++++--------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index a2620173..3a722e80 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -95,10 +95,11 @@ struct ComplexFilterCriteria { alignedTChainTmScores.clear(); } - bool satisfy(int covMode, float covThr, float chainTMThr) { + bool satisfy(int covMode, float covThr, float TMThr, float chainTMThr) { const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); + const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); const bool chainTMOK = hasChainTm(chainTMThr, covMode, alignedQChainTmScores, alignedTChainTmScores); - return (covOK && chainTMOK); + return (covOK && TMOK && chainTMOK); } void update(unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm) { @@ -120,7 +121,7 @@ struct ComplexFilterCriteria { float tCov; double qTM; double tTM; - //TODO : Instead of saving all the chain tm scores, only keeping the worst one? + std::vector alignedQChainTmScores; std::vector alignedTChainTmScores; }; @@ -221,15 +222,9 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u float d0; // float score_d8 = 1.5*pow(normlen,0.3)+3.5; - // set4search // if (normlen<=19) { // d0=0.168; - // } - // else { - // d0=1.24*pow((normlen-15),1.0/3)-1.8; - // } - // d0 += 0.8; - +// } // set4final if (normlen<=21) { d0=0.5; @@ -237,6 +232,7 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u else { d0=1.24*pow((normlen-15),1.0/3)-1.8; } + // d0 += 0.8; Coordinates tmt(mlen); BasicFunction::do_rotation(tm, tmt, mlen, t, u); @@ -338,7 +334,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { #ifdef OPENMP -//localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); +// localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); #endif const bool shouldCompress = (par.compressed == true); const int db4Type = Parameters::DBTYPE_CLUSTER_RES; @@ -393,8 +389,9 @@ int filtercomplex(int argc, const char **argv, const Command &command) { qComplexLength[qComplexId] = reslen; } - Debug(Debug::ERROR) << "Monomer will be treated as singleton\nMonomer chain key: \n"; + Debug(Debug::WARNING) << "Monomer will be treated as singleton\nMonomer chain key: \n"; for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { + std::map tmpDBKEYut; std::map localComplexMap; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); @@ -447,7 +444,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { float* tdata = tcoords.read(tcadata, tChainLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); - // TODO: do not check the TM score here, be consistent with the other filters + if (hasTM(par.filtComplexTmThr, par.covMode, retComplex.qTmScore, retComplex.tTmScore)){ unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); @@ -457,6 +454,7 @@ int filtercomplex(int argc, const char **argv, const Command &command) { Coordinates qm(0), tm(0); fillUArr(retComplex.uString, u); fillTArr(retComplex.tString, t); + tmpDBKEYut[assId]=retComplex.uString+","+retComplex.tString; unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); double qChainTm = chainTm / qChainLen; @@ -486,9 +484,26 @@ int filtercomplex(int argc, const char **argv, const Command &command) { for (auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); - if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtChainTmThr)){ + if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr)){ assIdsToDelete.push_back(assId_res.first); } + else { + if (qComplexId != tComplexId){ + Debug(Debug::WARNING) << "q: "< Date: Fri, 19 Apr 2024 20:52:33 +0900 Subject: [PATCH 084/160] maybe solved chain TM --- src/strucclustutils/filtercomplex.cpp | 32 ++++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 3a722e80..fcfd3093 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -121,7 +121,7 @@ struct ComplexFilterCriteria { float tCov; double qTM; double tTM; - +//TODO : Instead of saving all the chain tm scores, only keeping the worst one? std::vector alignedQChainTmScores; std::vector alignedTChainTmScores; }; @@ -230,16 +230,17 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u d0=0.5; } else { - d0=1.24*pow((normlen-15),1.0/3)-1.8; + d0=(1.24*pow((normlen*1.0-15), 1.0/3)-1.8); } // d0 += 0.8; Coordinates tmt(mlen); BasicFunction::do_rotation(tm, tmt, mlen, t, u); - + float d02 = d0*d0; // float score_d82 = score_d8*score_d8; for (unsigned int k=0; k Date: Sat, 20 Apr 2024 15:31:25 +0900 Subject: [PATCH 085/160] Revert "maybe solved chain TM" This reverts commit 27f9ac86b36f2c186a04bc44dd9163ba69638444. --- src/strucclustutils/filtercomplex.cpp | 32 +++++++-------------------- 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index fcfd3093..3a722e80 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -121,7 +121,7 @@ struct ComplexFilterCriteria { float tCov; double qTM; double tTM; -//TODO : Instead of saving all the chain tm scores, only keeping the worst one? + std::vector alignedQChainTmScores; std::vector alignedTChainTmScores; }; @@ -230,17 +230,16 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u d0=0.5; } else { - d0=(1.24*pow((normlen*1.0-15), 1.0/3)-1.8); + d0=1.24*pow((normlen-15),1.0/3)-1.8; } // d0 += 0.8; Coordinates tmt(mlen); BasicFunction::do_rotation(tm, tmt, mlen, t, u); - + float d02 = d0*d0; // float score_d82 = score_d8*score_d8; for (unsigned int k=0; k Date: Sat, 20 Apr 2024 21:30:05 +0900 Subject: [PATCH 086/160] [TODO] multithreading segfault --- src/commons/LocalParameters.cpp | 9 +- src/commons/LocalParameters.h | 2 + src/strucclustutils/filtercomplex.cpp | 150 +++++++++++++++----------- 3 files changed, 98 insertions(+), 63 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 48c9890e..3d6bb0ff 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -32,8 +32,9 @@ LocalParameters::LocalParameters() : PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_COMPLEX_TM_THRESHOLD(PARAM_COMPLEX_TM_THRESHOLD_ID,"--complex-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtComplexTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "per chain TMscore threshold for filtercomplex", "accept alignments satisfying tmscores of all chains > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") - + PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "per chain TMscore threshold for filtercomplex", "accept alignments satisfying tmscores of all chains > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_SAME_CHAIN_NUM(PARAM_SAME_CHAIN_NUM_ID, "--same-chain-number", "Only Cluster complex with same chain number", "Cluster only same chain numbers(1) or not(0)", typeid(bool), (void *) &sameChainNumber, "") + { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$"; @@ -188,6 +189,8 @@ LocalParameters::LocalParameters() : filtercomplex.push_back(&PARAM_COV_MODE); filtercomplex.push_back(&PARAM_COMPLEX_TM_THRESHOLD); filtercomplex.push_back(&PARAM_CHAIN_TM_THRESHOLD); + filtercomplex.push_back(&PARAM_SAME_CHAIN_NUM); + // createcomplexreport createcomplexreport.push_back(&PARAM_DB_OUTPUT); @@ -251,6 +254,8 @@ LocalParameters::LocalParameters() : citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); filtComplexTmThr = 0.0; filtChainTmThr = 0.0; + sameChainNumber = 0; + //rewrite param vals. PARAM_FORMAT_OUTPUT.description = "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,\nlddt,lddtfull,qca,tca,t,u,qtmscore,ttmscore,alntmscore,rmsd,prob\ncomplexqtmscore,complexttmscore,complexu,complext,complexassignid\n"; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index ffe8bed9..15487e6f 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -127,6 +127,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_PDB_OUTPUT_MODE) PARAMETER(PARAM_COMPLEX_TM_THRESHOLD) PARAMETER(PARAM_CHAIN_TM_THRESHOLD) + PARAMETER(PARAM_SAME_CHAIN_NUM) int prefMode; float tmScoreThr; @@ -152,6 +153,7 @@ class LocalParameters : public Parameters { int pdbOutputMode; float filtComplexTmThr; float filtChainTmThr; + bool sameChainNumber; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner, diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 3a722e80..b6f0e791 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -37,6 +37,16 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { } } +static bool hasChainnum(bool sameChainNum, int qChainNum, int tChainNum){ + switch (sameChainNum){ + case 1: + if (qChainNum != tChainNum){ + return false; + }else{return true;} + case 0: + return true; + } +} static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: @@ -95,11 +105,12 @@ struct ComplexFilterCriteria { alignedTChainTmScores.clear(); } - bool satisfy(int covMode, float covThr, float TMThr, float chainTMThr) { + bool satisfy(int covMode, float covThr, float TMThr, float chainTMThr, bool sameChainNum, int qChainNum, int tChainNum ) { const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); const bool chainTMOK = hasChainTm(chainTMThr, covMode, alignedQChainTmScores, alignedTChainTmScores); - return (covOK && TMOK && chainTMOK); + const bool numOK = hasChainnum(sameChainNum, qChainNum, tChainNum); + return (covOK && TMOK && chainTMOK && numOK); } void update(unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm) { @@ -199,8 +210,8 @@ unsigned int fillMatchedCoord(float * qdata, float * tdata, ti++; } } - qm.realloc(mi); - tm.realloc(mi); + qm.reallocate(mi); + tm.reallocate(mi); std::copy(qx.begin(), qx.end(), qm.x); std::copy(qy.begin(), qy.end(), qm.y); std::copy(qz.begin(), qz.end(), qm.z); @@ -332,9 +343,9 @@ int filtercomplex(int argc, const char **argv, const Command &command) { alnDbr.open(DBReader::LINEAR_ACCCESS); size_t localThreads = 1; - + // Debug(Debug::WARNING) << "Monomer will be treated as singleton\nMonomer chain key: \n"; #ifdef OPENMP -// localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); +localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); #endif const bool shouldCompress = (par.compressed == true); const int db4Type = Parameters::DBTYPE_CLUSTER_RES; @@ -361,45 +372,46 @@ int filtercomplex(int argc, const char **argv, const Command &command) { Debug::Progress progress(qComplexIdVec.size()); std::map qComplexLength, tComplexLength; + for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { + unsigned int tComplexId = tComplexIdVec[tComplexIdx]; + std::vector &tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); + if (tChainKeys.empty()) { + continue; + } + unsigned int reslen = getComplexResidueLength(tDbr, tChainKeys); + tComplexLength[tComplexId] =reslen; + } + for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { + unsigned int qComplexId = qComplexIdVec[qComplexIdx]; + std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); + if (qChainKeys.empty()) { + continue; + } + unsigned int reslen = getComplexResidueLength(qDbr, qChainKeys); + qComplexLength[qComplexId] = reslen; + } + + + #pragma omp parallel num_threads(localThreads) - { + { std::string result; + std::map tmpDBKEYut; + std::map localComplexMap; + std::vector assIdsToDelete; + std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] + std::vector selectedAssIDs; unsigned int thread_idx = 0; #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif - Matcher::result_t res; -#pragma omp for schedule(dynamic, 10) nowait - - for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { - unsigned int tComplexId = tComplexIdVec[tComplexIdx]; - std::vector &tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - if (tChainKeys.empty()) { - continue; - } - unsigned int reslen = getComplexResidueLength(tDbr, tChainKeys); - tComplexLength[tComplexId] =reslen; - } - for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { - unsigned int qComplexId = qComplexIdVec[qComplexIdx]; - std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); - if (qChainKeys.empty()) { - continue; - } - unsigned int reslen = getComplexResidueLength(qDbr, qChainKeys); - qComplexLength[qComplexId] = reslen; - } - - Debug(Debug::WARNING) << "Monomer will be treated as singleton\nMonomer chain key: \n"; +#pragma omp for schedule(dynamic, 10) for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { - std::map tmpDBKEYut; - std::map localComplexMap; unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); Coordinate16 qcoords; Coordinate16 tcoords; - std::string result; - std::vector discardAssIdvec; + for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainDbKey = alnDbr.getId(qChainKey); @@ -477,40 +489,53 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } } - } + } // while end } - std::vector assIdsToDelete; - for (auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); - if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr)){ + std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); + if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, par.sameChainNumber,qChainKeys.size(), tChainKeys.size())){ assIdsToDelete.push_back(assId_res.first); + // if (qComplexId != tComplexId){ + // Debug(Debug::WARNING) << "BAD: q: "<> cmplIdToBestAssId; // cmplId : [assId, alnSum] for (const auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); unsigned int alnlen = adjustAlnLen(assId_res.second.qTotalAlnLen, assId_res.second.tTotalAlnLen, par.covMode); @@ -524,7 +549,6 @@ int filtercomplex(int argc, const char **argv, const Command &command) { } } - std::vector selectedAssIDs; for (const auto& pair : cmplIdToBestAssId){ selectedAssIDs.push_back(pair.second[0]); } @@ -538,12 +562,16 @@ int filtercomplex(int argc, const char **argv, const Command &command) { result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(tComplexId) + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); } resultWriter.writeData(result.c_str(), result.length(), qComplexId); - + result.clear(); localComplexMap.clear(); - selectedAssIDs.clear(); + tmpDBKEYut.clear(); + assIdsToDelete.clear(); cmplIdToBestAssId.clear(); - } - } + selectedAssIDs.clear(); + + } // for end + } // MP end + resultWrite5.writeData(result5.c_str(), result5.length(), 0); resultWriter.close(true); resultWrite5.close(par.dbOut == false); From 09b4e410c44cb942593cb368ac09812061061881 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 21 Apr 2024 00:44:33 +0900 Subject: [PATCH 087/160] Solved Multithreading --- src/strucclustutils/filtercomplex.cpp | 61 +++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index b6f0e791..634fb8fc 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -6,7 +6,6 @@ #include "DBReader.h" #include "IndexReader.h" #include "FileUtil.h" -#include "TranslateNucl.h" #include "MemoryMapped.h" #include "Coordinate16.h" #include "tmalign/basic_fun.h" @@ -37,7 +36,7 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { } } -static bool hasChainnum(bool sameChainNum, int qChainNum, int tChainNum){ +bool hasChainnum(bool sameChainNum, int qChainNum, int tChainNum){ switch (sameChainNum){ case 1: if (qChainNum != tChainNum){ @@ -47,7 +46,8 @@ static bool hasChainnum(bool sameChainNum, int qChainNum, int tChainNum){ return true; } } -static bool hasTM(float TMThr, int covMode, double qTM, double tTM){ + +bool hasTM(float TMThr, int covMode, double qTM, double tTM){ switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: return ((qTM>= TMThr) && (tTM >= TMThr)); @@ -319,15 +319,14 @@ int filtercomplex(int argc, const char **argv, const Command &command) { const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); int dbaccessMode = (DBReader::USE_INDEX); - char buffer[32]; - IndexReader* qDbr; + IndexReader* qDbr = NULL; qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); DBReader qStructDbr((par.db1 + "_ca").c_str(), (par.db1 + "_ca.index").c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); qStructDbr.open(DBReader::NOSORT); - IndexReader* tDbr; + IndexReader* tDbr = NULL; DBReader *tStructDbr = NULL; if (sameDB) { tDbr = qDbr; @@ -356,12 +355,10 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t const int db5Type = Parameters::DBTYPE_GENERIC_DB; DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), 1, shouldCompress, db5Type); resultWrite5.open(); - resultToWrite_t result5; std::string qLookupFile = par.db1 + ".lookup"; std::string tLookupFile = par.db2 + ".lookup"; - TranslateNucl translateNucl(static_cast(par.translationTable)); - Matcher::result_t res; + chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; complexIdToChainKeys_t qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; std::map qcomplexIdToName, tcomplexIdToName; @@ -371,6 +368,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t qChainKeyToComplexIdMap.clear(); Debug::Progress progress(qComplexIdVec.size()); std::map qComplexLength, tComplexLength; + std::map qComplexIdResult; for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { unsigned int tComplexId = tComplexIdVec[tComplexIdx]; @@ -393,24 +391,33 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t -#pragma omp parallel num_threads(localThreads) - { std::string result; +#pragma omp parallel num_threads(localThreads) + { + resultToWrite_t result5; + char buffer[32]; + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); +#endif + std::string result; std::map tmpDBKEYut; std::map localComplexMap; std::vector assIdsToDelete; std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] std::vector selectedAssIDs; - unsigned int thread_idx = 0; -#ifdef OPENMP - thread_idx = static_cast(omp_get_thread_num()); -#endif -#pragma omp for schedule(dynamic, 10) + Coordinate16 qcoords; + Coordinate16 tcoords; + + Matcher::result_t res; +#pragma omp for schedule(dynamic, 1) for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { + + unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); + - Coordinate16 qcoords; - Coordinate16 tcoords; + for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; @@ -561,18 +568,28 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t result.push_back('\n'); result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(tComplexId) + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); } - resultWriter.writeData(result.c_str(), result.length(), qComplexId); + #pragma omp critical + { + qComplexIdResult[qComplexId]= result; + // resultWriter.writeData(result.c_str(), result.length(), qComplexId); + } + result.clear(); localComplexMap.clear(); tmpDBKEYut.clear(); assIdsToDelete.clear(); cmplIdToBestAssId.clear(); selectedAssIDs.clear(); + } // for end + resultWrite5.writeData(result5.c_str(), result5.length(), 0); + result5.clear(); } // MP end - - resultWrite5.writeData(result5.c_str(), result5.length(), 0); + for (auto &pair : qComplexIdResult){ + resultWriter.writeData(pair.second.c_str(), pair.second.length(), pair.first); + } + resultWriter.close(true); resultWrite5.close(par.dbOut == false); alnDbr.close(); @@ -581,7 +598,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t delete tDbr; } - result5.clear(); + qChainKeyToComplexIdMap.clear(); tChainKeyToComplexIdMap.clear(); qComplexIdToChainKeyMap.clear(); From a8f6588f047ff5e2360dfa33254fe6370a58fd97 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 24 Apr 2024 12:08:30 +0900 Subject: [PATCH 088/160] simple --- src/strucclustutils/filtercomplex.cpp | 60 ++++++++++----------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 634fb8fc..bd33f924 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -210,8 +210,8 @@ unsigned int fillMatchedCoord(float * qdata, float * tdata, ti++; } } - qm.reallocate(mi); - tm.reallocate(mi); + qm.realloc(mi); + tm.realloc(mi); std::copy(qx.begin(), qx.end(), qm.x); std::copy(qy.begin(), qy.end(), qm.y); std::copy(qz.begin(), qz.end(), qm.z); @@ -235,7 +235,7 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u // if (normlen<=19) { // d0=0.168; -// } + // } // set4final if (normlen<=21) { d0=0.5; @@ -409,7 +409,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t Coordinate16 tcoords; Matcher::result_t res; -#pragma omp for schedule(dynamic, 1) +#pragma omp for schedule(dynamic, 1) for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { @@ -457,43 +457,30 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t break; } + float u[3][3]; + float t[3]; + Coordinates qm(0), tm(0); + fillUArr(retComplex.uString, u); + fillTArr(retComplex.tString, t); + int tChainLen = tDbr->sequenceReader->getSeqLen(tChainDbKey); char *tcadata = tStructDbr->getData(tChainDbKey, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbKey); float* tdata = tcoords.read(tcadata, tChainLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); - - - if (hasTM(par.filtComplexTmThr, par.covMode, retComplex.qTmScore, retComplex.tTmScore)){ - unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - if (par.filtChainTmThr > 0 ){ - float u[3][3]; - float t[3]; - Coordinates qm(0), tm(0); - fillUArr(retComplex.uString, u); - fillTArr(retComplex.tString, t); - tmpDBKEYut[assId]=retComplex.uString+","+retComplex.tString; - unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - double chainTm = computeChainTmScore(qm, tm, t, u, match_len, normlen); - double qChainTm = chainTm / qChainLen; - double tChainTm = chainTm / tChainLen; - - if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); - localComplexMap[assId] = cmplfiltcrit; - } else { - localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); - } - } - else{ - if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, 1, 1); - localComplexMap[assId] = cmplfiltcrit; - } else { - localComplexMap.at(assId).update(qtotalaln, ttotalaln, 1, 1); - } - } + unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + Debug(Debug::ERROR) << "match_len"< Date: Wed, 24 Apr 2024 15:39:16 +0900 Subject: [PATCH 089/160] Look at this. ChainTM goes higher than 1 --- src/commons/LocalParameters.cpp | 2 +- src/strucclustutils/filtercomplex.cpp | 54 +++++++++++++-------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 3d6bb0ff..c3b8b963 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -33,7 +33,7 @@ LocalParameters::LocalParameters() : PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_COMPLEX_TM_THRESHOLD(PARAM_COMPLEX_TM_THRESHOLD_ID,"--complex-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtComplexTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "per chain TMscore threshold for filtercomplex", "accept alignments satisfying tmscores of all chains > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_SAME_CHAIN_NUM(PARAM_SAME_CHAIN_NUM_ID, "--same-chain-number", "Only Cluster complex with same chain number", "Cluster only same chain numbers(1) or not(0)", typeid(bool), (void *) &sameChainNumber, "") + PARAM_SAME_CHAIN_NUM(PARAM_SAME_CHAIN_NUM_ID, "--only-same-nmer", "Only Cluster complex with same number of chains", "Cluster only Nmer with same N(1) or not(0)", typeid(bool), (void *) &sameChainNumber, "") { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index bd33f924..be25068a 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -241,7 +241,7 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u d0=0.5; } else { - d0=1.24*pow((normlen-15),1.0/3)-1.8; + d0=(1.24*pow((normlen*1.0-15), 1.0/3)-1.8); } // d0 += 0.8; @@ -415,14 +415,10 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); - - - for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainDbKey = alnDbr.getId(qChainKey); - if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY){ Debug(Debug::ERROR)<sequenceReader->getSeqLen(qChainDbKey); char *qcadata = qStructDbr.getData(qChainDbKey, thread_idx); size_t qCaLength = qStructDbr.getEntryLen(qChainDbKey); float* qdata = qcoords.read(qcadata, qChainLen, qCaLength); - char *data = alnDbr.getData(qChainDbKey, thread_idx); - while (*data) { + while (*data != '\0' ) { ComplexDataHandler retComplex = parseScoreComplexResult(data, res); if (!retComplex.isValid){ @@ -451,7 +447,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int tChainDbKey = res.dbKey; unsigned int tChainKey = tDbr->sequenceReader->getDbKey(tChainDbKey); tChainDbKey = alnDbr.getId(tChainKey); - if (tChainDbKey == NOT_AVAILABLE_CHAIN_KEY){ // Debug(Debug::ERROR) << "tChainKey"<sequenceReader->getSeqLen(tChainDbKey); char *tcadata = tStructDbr->getData(tChainDbKey, thread_idx); @@ -469,9 +465,8 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t float* tdata = tcoords.read(tcadata, tChainLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - Debug(Debug::ERROR) << "match_len"< 1){ + Debug(Debug::ERROR) << "\nGOOD: Qchain: "< 1){ + Debug(Debug::ERROR) << "GOOD: Tchain: "< Date: Thu, 25 Apr 2024 14:39:28 +0900 Subject: [PATCH 090/160] res.Len seems right --- src/strucclustutils/filtercomplex.cpp | 60 +++++++++++++-------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index be25068a..64e530a0 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -427,16 +427,15 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(qComplexId) + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); break; } - char *data = alnDbr.getData(qChainDbKey, thread_idx); - int qChainLen = qDbr->sequenceReader->getSeqLen(qChainDbKey); - char *qcadata = qStructDbr.getData(qChainDbKey, thread_idx); - size_t qCaLength = qStructDbr.getEntryLen(qChainDbKey); - float* qdata = qcoords.read(qcadata, qChainLen, qCaLength); - + char *data = alnDbr.getData(qChainDbKey, thread_idx); while (*data != '\0' ) { ComplexDataHandler retComplex = parseScoreComplexResult(data, res); - + // int qChainLen = qDbr->sequenceReader->getSeqLen(qChainDbKey); + char *qcadata = qStructDbr.getData(qChainDbKey, thread_idx); + size_t qCaLength = qStructDbr.getEntryLen(qChainDbKey); + float* qdata = qcoords.read(qcadata, res.qLen, qCaLength); + if (!retComplex.isValid){ Debug(Debug::ERROR) << "No scorecomplex result provided"; EXIT(EXIT_FAILURE); @@ -459,15 +458,16 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t fillTArr(retComplex.tString, t); tmpDBKEYut[assId]=retComplex.uString+","+retComplex.tString; - int tChainLen = tDbr->sequenceReader->getSeqLen(tChainDbKey); + // int tChainLen = tDbr->sequenceReader->getSeqLen(tChainDbKey); char *tcadata = tStructDbr->getData(tChainDbKey, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbKey); - float* tdata = tcoords.read(tcadata, tChainLen, tCaLength); + float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + Debug(Debug::ERROR) << match_len<<"\t"< 1){ - Debug(Debug::ERROR) << "\nGOOD: Qchain: "< 1){ - Debug(Debug::ERROR) << "GOOD: Tchain: "<1){ + // Debug(Debug::ERROR) << "\nGOOD: Qchain: "<1){ + // Debug(Debug::ERROR) << "GOOD: Tchain: "< Date: Thu, 25 Apr 2024 15:20:18 +0900 Subject: [PATCH 091/160] Check if no aligned chain exists --- src/strucclustutils/filtercomplex.cpp | 57 ++++++++++++++++----------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 64e530a0..a5d066ce 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -64,30 +64,43 @@ bool hasTM(float TMThr, int covMode, double qTM, double tTM){ } } -bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores) { +bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores, unsigned int qChainNum, unsigned int tChainNum) { if (chainTMThr > 0 ){ - for (size_t i = 0; i < qChainTmScores.size(); i++) { - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + if (qChainTmScores.size() &qChainKeys) { - unsigned int qResidueLen = 0; - for (auto qChainKey: qChainKeys) { - size_t id = qDbr->sequenceReader->getId(qChainKey); +unsigned int getComplexResidueLength( IndexReader *Dbr, std::vector &ChainKeys) { + unsigned int ResidueLen = 0; + for (auto ChainKey: ChainKeys) { + size_t id = Dbr->sequenceReader->getId(ChainKey); // Not accessible if (id == NOT_AVAILABLE_CHAIN_KEY) return 0; - qResidueLen += qDbr->sequenceReader->getSeqLen(id); + ResidueLen += Dbr->sequenceReader->getSeqLen(id); } - return qResidueLen; + return ResidueLen; } static void getlookupInfo( @@ -464,7 +477,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - Debug(Debug::ERROR) << match_len<<"\t"< Date: Thu, 25 Apr 2024 16:23:15 +0900 Subject: [PATCH 092/160] DbKey to AlnId/DbId --- src/strucclustutils/filtercomplex.cpp | 35 ++++++++++++--------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index a5d066ce..bdc18395 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -431,9 +431,11 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; - unsigned int qChainDbKey = alnDbr.getId(qChainKey); - if (qChainDbKey == NOT_AVAILABLE_CHAIN_KEY){ - Debug(Debug::ERROR)<sequenceReader->getId(qChainKey); + // Debug(Debug::ERROR)<sequenceReader->getSeqLen(qChainDbKey); - char *qcadata = qStructDbr.getData(qChainDbKey, thread_idx); - size_t qCaLength = qStructDbr.getEntryLen(qChainDbKey); + char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); + size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); float* qdata = qcoords.read(qcadata, res.qLen, qCaLength); if (!retComplex.isValid){ @@ -456,24 +457,20 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t data = Util::skipLine(data); unsigned int assId = retComplex.assId; - unsigned int tChainDbKey = res.dbKey; - unsigned int tChainKey = tDbr->sequenceReader->getDbKey(tChainDbKey); - tChainDbKey = alnDbr.getId(tChainKey); - if (tChainDbKey == NOT_AVAILABLE_CHAIN_KEY){ - // Debug(Debug::ERROR) << "tChainKey"<sequenceReader->getId(tChainKey); + if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ break; } - float u[3][3]; float t[3]; Coordinates qm(0), tm(0); fillUArr(retComplex.uString, u); fillTArr(retComplex.tString, t); tmpDBKEYut[assId]=retComplex.uString+","+retComplex.tString; - - // int tChainLen = tDbr->sequenceReader->getSeqLen(tChainDbKey); - char *tcadata = tStructDbr->getData(tChainDbKey, thread_idx); - size_t tCaLength = tStructDbr->getEntryLen(tChainDbKey); + char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); + size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); @@ -485,7 +482,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(res.dbKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); localComplexMap[assId] = cmplfiltcrit; } else { localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); @@ -497,7 +494,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, par.sameChainNumber,qChainKeys.size(), tChainKeys.size())){ + if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, par.sameChainNumber, qChainKeys.size(), tChainKeys.size())){ assIdsToDelete.push_back(assId_res.first); // if (qComplexId != tComplexId){ // Debug(Debug::WARNING) << "BAD: q: "< Date: Sat, 27 Apr 2024 18:18:38 +0900 Subject: [PATCH 093/160] [MAYBE SOLVED} chainTM --- lib/tmalign/Coordinates.h | 93 ++++++++++------ src/strucclustutils/filtercomplex.cpp | 149 ++++++++++++-------------- 2 files changed, 128 insertions(+), 114 deletions(-) diff --git a/lib/tmalign/Coordinates.h b/lib/tmalign/Coordinates.h index b0f7dcec..80102d33 100644 --- a/lib/tmalign/Coordinates.h +++ b/lib/tmalign/Coordinates.h @@ -2,13 +2,70 @@ // Created by Martin Steinegger on 1/15/21. // +// #ifndef STRUCCLUST_COORDINATES_H +// #define STRUCCLUST_COORDINATES_H +// #include "simd.h" +// #include + +// struct Coordinates{ + // Coordinates(int size) : size(size) { + // x =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + // y =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + // z =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + // allocated = true; + // } + // Coordinates(){ + // allocated = false; + // } + // ~Coordinates(){ + // if(allocated == true){ + // free(x); + // free(y); + // free(z); + // } + // } + // bool allocated; + // float * x; + // float * y; + // float * z; + // int size; + + // void reallocate(int newsize){ + // if (allocated == false) { + // x =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + // y =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + // z =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); + // allocated = true; + // size = newsize; + // } else { + // if (newsize > size) { + // float* new_x =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); + // float* new_y =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); + // float* new_z =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); + + // if (x) memcpy(new_x, x, size * sizeof(float)); + // if (y) memcpy(new_y, y, size * sizeof(float)); + // if (z) memcpy(new_z, z, size * sizeof(float)); + + // free(x); + // free(y); + // free(z); + + // x = new_x; + // y = new_y; + // z = new_z; + // size = newsize; + // } + // } + // } +// }; +// #endif //STRUCCLUST_COORDINATES_H #ifndef STRUCCLUST_COORDINATES_H #define STRUCCLUST_COORDINATES_H #include "simd.h" -#include struct Coordinates{ - Coordinates(int size) : size(size) { + Coordinates(int size){ x =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); y =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); z =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); @@ -28,35 +85,5 @@ struct Coordinates{ float * x; float * y; float * z; - int size; - - void realloc(int newsize){ - if (allocated == false) { - x =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - y =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - z =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - allocated = true; - size = newsize; - } else { - if (newsize > size) { - float* new_x =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); - float* new_y =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); - float* new_z =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); - - if (x) memcpy(new_x, x, size * sizeof(float)); - if (y) memcpy(new_y, y, size * sizeof(float)); - if (z) memcpy(new_z, z, size * sizeof(float)); - - free(x); - free(y); - free(z); - - x = new_x; - y = new_y; - z = new_z; - size = newsize; - } - } - } }; -#endif //STRUCCLUST_COORDINATES_H +#endif //STRUCCLUST_COORDINATES_H \ No newline at end of file diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index bdc18395..17a933ba 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -14,6 +14,7 @@ #include "CalcProbTP.h" #include + #ifdef OPENMP #include #endif @@ -189,29 +190,33 @@ void fillTArr(const std::string &tString, float (&t)[3]) { } } -unsigned int fillMatchedCoord(float * qdata, float * tdata, +unsigned int cigarToAlignedLength(const std::string &cigar){ + std::string backtrace = Matcher::uncompressAlignment(cigar); + unsigned int alni = 0; + for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { + if (backtrace[btPos] == 'M') { + alni++; + } + } + return alni; +} + +void fillMatchedCoord(float * qdata, float * tdata, Coordinates &qm, Coordinates &tm, const std::string &cigar, int qStartPos, int tStartPos, int qLen, int tLen) { - std::vector qx, qy, qz, tx, ty, tz; int qi = qStartPos; int ti = tStartPos; - unsigned int qXPos = 0; - unsigned int qYPos = qLen; - unsigned int qZPos = qLen*2; - unsigned int tXPos = 0; - unsigned int tYPos = tLen; - unsigned int tZPos = tLen*2; int mi = 0; std::string backtrace = Matcher::uncompressAlignment(cigar); for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { if (backtrace[btPos] == 'M') { - qx.push_back(qdata[qXPos + qi]); - qy.push_back(qdata[qYPos + qi]); - qz.push_back(qdata[qZPos + qi]); - tx.push_back(tdata[tXPos + ti]); - ty.push_back(tdata[tYPos + ti]); - tz.push_back(tdata[tZPos + ti]); + qm.x[mi] = qdata[qi]; + qm.y[mi] = qdata[qLen + qi]; + qm.z[mi] = qdata[2*qLen + qi]; + tm.x[mi] = tdata[ti]; + tm.y[mi] = tdata[tLen + ti]; + tm.z[mi] = tdata[2*tLen + ti]; qi++; ti++; mi++; @@ -223,53 +228,30 @@ unsigned int fillMatchedCoord(float * qdata, float * tdata, ti++; } } - qm.realloc(mi); - tm.realloc(mi); - std::copy(qx.begin(), qx.end(), qm.x); - std::copy(qy.begin(), qy.end(), qm.y); - std::copy(qz.begin(), qz.end(), qm.z); - std::copy(tx.begin(), tx.end(), tm.x); - std::copy(ty.begin(), ty.end(), tm.y); - std::copy(tz.begin(), tz.end(), tm.z); - qx.clear(); - qy.clear(); - qz.clear(); - tx.clear(); - ty.clear(); - tz.clear(); - - return mi; } -double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u[3][3], unsigned int mlen, int normlen) { +double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u[3][3], unsigned int alnLen, int tLen) { double tmscore = 0; - float d0; - // float score_d8 = 1.5*pow(normlen,0.3)+3.5; - - // if (normlen<=19) { - // d0=0.168; + double tmalnScore = 0; + // float d0; + // if (normlen<=21) { + // d0=0.5; // } - // set4final - if (normlen<=21) { - d0=0.5; - } - else { - d0=(1.24*pow((normlen*1.0-15), 1.0/3)-1.8); - } - // d0 += 0.8; - - Coordinates tmt(mlen); - BasicFunction::do_rotation(tm, tmt, mlen, t, u); - + // else { + // d0=(1.24*pow((normlen*1.0-15), 1.0/3)-1.8); + // } + + float d0 = 1.24*(cbrt(tLen-15)) -1.8; float d02 = d0*d0; - // float score_d82 = score_d8*score_d8; - for (unsigned int k=0; k &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); @@ -433,9 +414,8 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainAlnId = alnDbr.getId(qChainKey); unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); - // Debug(Debug::ERROR)<sequenceReader->getId(tChainKey); + //if target is monomer, break to be singleton if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ break; } float u[3][3]; float t[3]; - Coordinates qm(0), tm(0); fillUArr(retComplex.uString, u); fillTArr(retComplex.tString, t); tmpDBKEYut[assId]=retComplex.uString+","+retComplex.tString; @@ -473,9 +453,10 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); - unsigned int match_len = fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - // Debug(Debug::ERROR) << match_len<<"\t"<= 0.9 && assId_res.second.qTM >= 0.9 && assId_res.second.alignedQChainTmScores.size() == qChainKeys.size() && qChainKeys.size()==tChainKeys.size()){ + // Debug(Debug::WARNING) << "BAD: q: "< Date: Tue, 7 May 2024 14:59:56 +0900 Subject: [PATCH 094/160] reset --- lib/tmalign/Coordinates.h | 58 --------------------- src/strucclustutils/createcomplexreport.cpp | 1 - 2 files changed, 59 deletions(-) diff --git a/lib/tmalign/Coordinates.h b/lib/tmalign/Coordinates.h index 80102d33..bcdf3ffe 100644 --- a/lib/tmalign/Coordinates.h +++ b/lib/tmalign/Coordinates.h @@ -2,64 +2,6 @@ // Created by Martin Steinegger on 1/15/21. // -// #ifndef STRUCCLUST_COORDINATES_H -// #define STRUCCLUST_COORDINATES_H -// #include "simd.h" -// #include - -// struct Coordinates{ - // Coordinates(int size) : size(size) { - // x =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - // y =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - // z =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - // allocated = true; - // } - // Coordinates(){ - // allocated = false; - // } - // ~Coordinates(){ - // if(allocated == true){ - // free(x); - // free(y); - // free(z); - // } - // } - // bool allocated; - // float * x; - // float * y; - // float * z; - // int size; - - // void reallocate(int newsize){ - // if (allocated == false) { - // x =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - // y =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - // z =(float*) mem_align(ALIGN_FLOAT, (size+VECSIZE_FLOAT)*sizeof(float)); - // allocated = true; - // size = newsize; - // } else { - // if (newsize > size) { - // float* new_x =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); - // float* new_y =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); - // float* new_z =(float*) mem_align(ALIGN_FLOAT, (newsize+VECSIZE_FLOAT)*sizeof(float)); - - // if (x) memcpy(new_x, x, size * sizeof(float)); - // if (y) memcpy(new_y, y, size * sizeof(float)); - // if (z) memcpy(new_z, z, size * sizeof(float)); - - // free(x); - // free(y); - // free(z); - - // x = new_x; - // y = new_y; - // z = new_z; - // size = newsize; - // } - // } - // } -// }; -// #endif //STRUCCLUST_COORDINATES_H #ifndef STRUCCLUST_COORDINATES_H #define STRUCCLUST_COORDINATES_H #include "simd.h" diff --git a/src/strucclustutils/createcomplexreport.cpp b/src/strucclustutils/createcomplexreport.cpp index be769cdd..0346ad00 100644 --- a/src/strucclustutils/createcomplexreport.cpp +++ b/src/strucclustutils/createcomplexreport.cpp @@ -164,7 +164,6 @@ int createcomplexreport(int argc, const char **argv, const Command &command) { compAlns[compAlnIdx].qChainNames.emplace_back(queryChainName); compAlns[compAlnIdx].tChainNames.emplace_back(targetChainName); } - } // while end } for (size_t compAlnIdx = 0; compAlnIdx < compAlns.size(); compAlnIdx++) { From b31f2ada5e19ed35623668019d9578c0cddc2575 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 7 May 2024 15:01:57 +0900 Subject: [PATCH 095/160] reset --- lib/tmalign/Coordinates.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tmalign/Coordinates.h b/lib/tmalign/Coordinates.h index bcdf3ffe..b929987f 100644 --- a/lib/tmalign/Coordinates.h +++ b/lib/tmalign/Coordinates.h @@ -28,4 +28,4 @@ struct Coordinates{ float * y; float * z; }; -#endif //STRUCCLUST_COORDINATES_H \ No newline at end of file +#endif //STRUCCLUST_COORDINATES_H From cb0a43ec0938d49d2c6d9b4f8e220c58f8e6f070 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Fri, 10 May 2024 21:10:42 +0900 Subject: [PATCH 096/160] minor --- data/complexsearch.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/data/complexsearch.sh b/data/complexsearch.sh index d721d5ac..0ce67fea 100644 --- a/data/complexsearch.sh +++ b/data/complexsearch.sh @@ -28,7 +28,6 @@ if [ "$PREFMODE" != "EXHAUSTIVE" ]; then fi RESULT="${TMP_PATH}/result_expand_aligned" fi - if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 $MMSEQS scorecomplex "${QUERYDB}" "${TARGETDB}" "${RESULT}" "${OUTPUT}" ${SCORECOMPLEX_PAR} \ From 5b10e67fc048ab97f6cffd7d8aa67dd60f74f2f0 Mon Sep 17 00:00:00 2001 From: SooyoungCha <97579193+ChaSooyoung@users.noreply.github.com> Date: Mon, 27 May 2024 18:13:03 +0900 Subject: [PATCH 097/160] Update filtercomplex.cpp --- src/strucclustutils/filtercomplex.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 17a933ba..cb0fbbf7 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -428,7 +428,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t ComplexDataHandler retComplex = parseScoreComplexResult(data, res); char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); - float* qdata = qcoords.read(qcadata, res.qLen, qCaLength); + float* qdata = qcoords.read(qcadata, qCaLength, qCaLength); if (!retComplex.isValid){ Debug(Debug::ERROR) << "No scorecomplex result provided"; @@ -451,7 +451,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t tmpDBKEYut[assId]=retComplex.uString+","+retComplex.tString; char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); - float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); + float* tdata = tcoords.read(tcadata, tCaLength, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); unsigned int alnLen = cigarToAlignedLength(res.backtrace); Coordinates qm(alnLen), tm(alnLen); From e8469df013a78e498a92b4a04998e3d08c1e2402 Mon Sep 17 00:00:00 2001 From: SooyoungCha <97579193+ChaSooyoung@users.noreply.github.com> Date: Mon, 27 May 2024 19:48:58 +0900 Subject: [PATCH 098/160] Update filtercomplex.cpp --- src/strucclustutils/filtercomplex.cpp | 44 ++------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index cb0fbbf7..37257ca6 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -428,7 +428,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t ComplexDataHandler retComplex = parseScoreComplexResult(data, res); char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); - float* qdata = qcoords.read(qcadata, qCaLength, qCaLength); + float* qdata = qcoords.read(qcadata, res.qLen, qCaLength); if (!retComplex.isValid){ Debug(Debug::ERROR) << "No scorecomplex result provided"; @@ -451,7 +451,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t tmpDBKEYut[assId]=retComplex.uString+","+retComplex.tString; char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); - float* tdata = tcoords.read(tcadata, tCaLength, tCaLength); + float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); unsigned int normlen = std::min(res.qLen, res.dbLen); unsigned int alnLen = cigarToAlignedLength(res.backtrace); Coordinates qm(alnLen), tm(alnLen); @@ -476,47 +476,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, par.sameChainNumber, qChainKeys.size(), tChainKeys.size())){ assIdsToDelete.push_back(assId_res.first); - // if (qComplexId != tComplexId){ - // #pragma omp critical - // { - // if(assId_res.second.tTM >= 0.9 && assId_res.second.qTM >= 0.9 && assId_res.second.alignedQChainTmScores.size() == qChainKeys.size() && qChainKeys.size()==tChainKeys.size()){ - // Debug(Debug::WARNING) << "BAD: q: "<1){ - // Debug(Debug::ERROR) << "\nGOOD: Qchain: "<1){ - // Debug(Debug::ERROR) << "GOOD: Tchain: "< Date: Tue, 28 May 2024 16:27:17 +0900 Subject: [PATCH 099/160] not done, but added vector check --- src/commons/LocalParameters.cpp | 8 +- src/commons/LocalParameters.h | 9 +- src/strucclustutils/filtercomplex.cpp | 285 ++++++++++++++++---------- src/workflow/ComplexCluster.cpp | 1 + 4 files changed, 189 insertions(+), 114 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index c3b8b963..10df864f 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -33,7 +33,7 @@ LocalParameters::LocalParameters() : PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_COMPLEX_TM_THRESHOLD(PARAM_COMPLEX_TM_THRESHOLD_ID,"--complex-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtComplexTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "per chain TMscore threshold for filtercomplex", "accept alignments satisfying tmscores of all chains > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_SAME_CHAIN_NUM(PARAM_SAME_CHAIN_NUM_ID, "--only-same-nmer", "Only Cluster complex with same number of chains", "Cluster only Nmer with same N(1) or not(0)", typeid(bool), (void *) &sameChainNumber, "") + PARAM_FILTER_MODE(PARAM_FILTER_MODE_ID, "--filter-mode", "Filter mode", "0: Interface\n1: Conformation\n2: loose", typeid(int), (void *) &filterMode, "[0-2]{0}$", MMseqsParameter::COMMAND_CLUST) { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; @@ -189,8 +189,7 @@ LocalParameters::LocalParameters() : filtercomplex.push_back(&PARAM_COV_MODE); filtercomplex.push_back(&PARAM_COMPLEX_TM_THRESHOLD); filtercomplex.push_back(&PARAM_CHAIN_TM_THRESHOLD); - filtercomplex.push_back(&PARAM_SAME_CHAIN_NUM); - + filtercomplex.push_back(&PARAM_FILTER_MODE); // createcomplexreport createcomplexreport.push_back(&PARAM_DB_OUTPUT); @@ -254,8 +253,7 @@ LocalParameters::LocalParameters() : citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); filtComplexTmThr = 0.0; filtChainTmThr = 0.0; - sameChainNumber = 0; - + filterMode = 0; //rewrite param vals. PARAM_FORMAT_OUTPUT.description = "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,\nlddt,lddtfull,qca,tca,t,u,qtmscore,ttmscore,alntmscore,rmsd,prob\ncomplexqtmscore,complexttmscore,complexu,complext,complexassignid\n"; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 15487e6f..5c50bfa3 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -79,6 +79,11 @@ class LocalParameters : public Parameters { static const int PDB_OUTPUT_MODE_SINGLECHAIN = 1; static const int PDB_OUTPUT_MODE_COMPLEX = 2; + // filter mode + static const int FILTER_MODE_INTERFACE = 0; + static const int FILTER_MODE_CONFORMATION = 1; + static const int FILTER_MODE_LOOSE = 2; + // TODO static const unsigned int FORMAT_ALIGNMENT_PDB_SUPERPOSED = 5; std::vector strucclust; @@ -127,7 +132,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_PDB_OUTPUT_MODE) PARAMETER(PARAM_COMPLEX_TM_THRESHOLD) PARAMETER(PARAM_CHAIN_TM_THRESHOLD) - PARAMETER(PARAM_SAME_CHAIN_NUM) + PARAMETER(PARAM_FILTER_MODE) int prefMode; float tmScoreThr; @@ -153,7 +158,7 @@ class LocalParameters : public Parameters { int pdbOutputMode; float filtComplexTmThr; float filtChainTmThr; - bool sameChainNumber; + int filterMode; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner, diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 37257ca6..836f4fbe 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -32,85 +32,81 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { case Parameters::COV_MODE_LENGTH_TARGET : case Parameters::COV_MODE_LENGTH_SHORTER : return 0; - default: - return 0; - } -} - -bool hasChainnum(bool sameChainNum, int qChainNum, int tChainNum){ - switch (sameChainNum){ - case 1: - if (qChainNum != tChainNum){ - return false; - }else{return true;} - case 0: - return true; - } -} - -bool hasTM(float TMThr, int covMode, double qTM, double tTM){ - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return ((qTM>= TMThr) && (tTM >= TMThr)); - case Parameters::COV_MODE_TARGET: - return (tTM >= TMThr); - case Parameters::COV_MODE_QUERY: - return (qTM >= TMThr); - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : - return true; - default: - return true; } } -bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores, unsigned int qChainNum, unsigned int tChainNum) { - if (chainTMThr > 0 ){ - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - if (qChainTmScores.size() &qChainTmScores, std::vector &tChainTmScores, unsigned int qChainNum, unsigned int tChainNum) { +// if (chainTMThr > 0 ){ +// switch (covMode) { +// case Parameters::COV_MODE_BIDIRECTIONAL: +// if (qChainTmScores.size()= 0 && qavgCoords[1]*tavgCoords[1] >= 0 && qavgCoords[2]*tavgCoords[2] >= 0 ); } -struct ComplexFilterCriteria { +class ComplexFilterCriteria { +public: + unsigned int dbKey; + unsigned int qTotalAlnLen; + unsigned int tTotalAlnLen; + float qCov; + float tCov; + double qTM; + double tTM; + bool sameCoord; + std::vector alignedQChainTmScores; + std::vector alignedTChainTmScores; ComplexFilterCriteria() {} - ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM, double qChainTm, double tChainTm) : - dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM) { + ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM, double qChainTm, double tChainTm, bool sameCoord) : + dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM), sameCoord(sameCoord) { alignedQChainTmScores.push_back(qChainTm); alignedTChainTmScores.push_back(tChainTm); } @@ -119,36 +115,86 @@ struct ComplexFilterCriteria { alignedTChainTmScores.clear(); } - bool satisfy(int covMode, float covThr, float TMThr, float chainTMThr, bool sameChainNum, int qChainNum, int tChainNum ) { + bool hasTM(float TMThr, int covMode, int filterMode){ + switch (filterMode){ + case LocalParameters::FILTER_MODE_INTERFACE: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return ((qTM>= TMThr) && (tTM >= TMThr)); + case Parameters::COV_MODE_TARGET: + return (tTM >= TMThr); + case Parameters::COV_MODE_QUERY: + return (qTM >= TMThr); + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return true; + } + } + bool hasChainNum(int covMode, int filterMode, int qChainNum, int tChainNum ){ + switch (filterMode){ + case LocalParameters::FILTER_MODE_INTERFACE: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); + case Parameters::COV_MODE_TARGET: + return (alignedTChainTmScores.size()==tChainNum); + case Parameters::COV_MODE_QUERY: + return (alignedQChainTmScores.size()==qChainNum); + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return true; + } + case LocalParameters::FILTER_MODE_CONFORMATION: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (qChainNum==tChainNum); + default: + return true; + } + case LocalParameters::FILTER_MODE_LOOSE: + return true; + + } + } + + bool hasMatchedCoord(int filterMode){ + switch (filterMode) { + case LocalParameters::FILTER_MODE_INTERFACE: + return (sameCoord); + case LocalParameters::FILTER_MODE_CONFORMATION: + case LocalParameters::FILTER_MODE_LOOSE: + return true; + } + } + + bool satisfy(int covMode, int filterMode, float covThr, float TMThr, float chainTMThr, int qChainNum, int tChainNum ) { + //TODO + // add filtermode in hasTM. + // find other criteria for other filtermodes const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); - const bool TMOK = hasTM(TMThr, covMode, qTM, tTM); - const bool chainTMOK = hasChainTm(chainTMThr, covMode, alignedQChainTmScores, alignedTChainTmScores, qChainNum, tChainNum); - const bool numOK = hasChainnum(sameChainNum, qChainNum, tChainNum); - return (covOK && TMOK && chainTMOK && numOK); + const bool TMOK = hasTM(TMThr, covMode, filterMode); + const bool chainNumOK = hasChainNum(covMode, filterMode, qChainNum, tChainNum); + const bool coordOK = hasMatchedCoord(filterMode); + // const bool chainTMOK = hasChainTm(chainTMThr, covMode, alignedQChainTmScores, alignedTChainTmScores, qChainNum, tChainNum); + // const bool numOK = hasChainnum(sameChainNum, qChainNum, tChainNum); + // return (covOK && TMOK && chainTMOK && numOK); + return (covOK && TMOK && chainNumOK && coordOK); } - void update(unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm) { + void update(unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm, bool sameCoord) { this->qTotalAlnLen += qTotalAlnLen; this->tTotalAlnLen += tTotalAlnLen; this->alignedQChainTmScores.push_back(qChainTm); this->alignedTChainTmScores.push_back(tChainTm); + this->sameCoord *= sameCoord; } void calcCov(unsigned int qLen, unsigned int tLen) { qCov = static_cast(qTotalAlnLen) / static_cast(qLen); tCov = static_cast(tTotalAlnLen) / static_cast(tLen); } - - unsigned int dbKey; - unsigned int qTotalAlnLen; - unsigned int tTotalAlnLen; - float qCov; - float tCov; - double qTM; - double tTM; - - std::vector alignedQChainTmScores; - std::vector alignedTChainTmScores; }; void fillUArr(const std::string &uString, float (&u)[3][3]) { @@ -232,28 +278,52 @@ void fillMatchedCoord(float * qdata, float * tdata, double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u[3][3], unsigned int alnLen, int tLen) { double tmscore = 0; - double tmalnScore = 0; - // float d0; - // if (normlen<=21) { - // d0=0.5; - // } - // else { - // d0=(1.24*pow((normlen*1.0-15), 1.0/3)-1.8); - // } - float d0 = 1.24*(cbrt(tLen-15)) -1.8; float d02 = d0*d0; Coordinates tmt(alnLen); BasicFunction::do_rotation(tm, tmt, alnLen, t, u); - for (unsigned int k=0; k &ChainKeys) { unsigned int ResidueLen = 0; @@ -452,7 +522,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); - unsigned int normlen = std::min(res.qLen, res.dbLen); unsigned int alnLen = cigarToAlignedLength(res.backtrace); Coordinates qm(alnLen), tm(alnLen); fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); @@ -461,12 +530,14 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t double tChainTm = chainTm/ res.dbLen; unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - + float qAvgCoord[3], tAvgCoord[3]; + getAverageCoord(qdata, qAvgCoord, res.qLen, tdata, tAvgCoord, res.dbLen, t, u) ; + bool coordSame = isSameCoord(qAvgCoord, tAvgCoord ); if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm); + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm, coordSame); localComplexMap[assId] = cmplfiltcrit; } else { - localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); + localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm, coordSame); } } // while end } @@ -474,7 +545,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - if (!assId_res.second.satisfy(par.covMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, par.sameChainNumber, qChainKeys.size(), tChainKeys.size())){ + if (!assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, qChainKeys.size(), tChainKeys.size())){ assIdsToDelete.push_back(assId_res.first); } } diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index 79ef80b4..bfc98ab7 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -12,6 +12,7 @@ void setComplexClusterDefaults(LocalParameters *p) { p->covThr = 0.8; p->filtComplexTmThr = 0.5; // FIX p->filtChainTmThr=0.0; // FIX + p->filterMode=0; p->covMode = 1; p->clusteringMode = Parameters::GREEDY; p->removeTmpFiles = true; From e06bc5087cfb0b4a3664484051b1e94f0ef71a07 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sat, 8 Jun 2024 21:56:25 +0900 Subject: [PATCH 100/160] octant --- data/complexcluster.sh | 2 +- src/commons/LocalParameters.cpp | 3 - src/commons/LocalParameters.h | 2 - src/strucclustutils/filtercomplex.cpp | 282 +++++++++++++++++++------- src/workflow/ComplexCluster.cpp | 2 +- 5 files changed, 209 insertions(+), 82 deletions(-) diff --git a/data/complexcluster.sh b/data/complexcluster.sh index 57aadd75..0ba0b066 100644 --- a/data/complexcluster.sh +++ b/data/complexcluster.sh @@ -64,7 +64,7 @@ buldCmplhDb(){ for (k = 2; k < length(words)+1; k++) { headerstring = headerstring words[k]" " } - if (!(output_string not in gogo)){ + if (!(output_string in gogo)){ print INDEXVAL"\t"output_string" "headerstring INDEXVAL++ } diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 10df864f..1ef9a4b4 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -32,7 +32,6 @@ LocalParameters::LocalParameters() : PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_COMPLEX_TM_THRESHOLD(PARAM_COMPLEX_TM_THRESHOLD_ID,"--complex-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtComplexTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "per chain TMscore threshold for filtercomplex", "accept alignments satisfying tmscores of all chains > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_FILTER_MODE(PARAM_FILTER_MODE_ID, "--filter-mode", "Filter mode", "0: Interface\n1: Conformation\n2: loose", typeid(int), (void *) &filterMode, "[0-2]{0}$", MMseqsParameter::COMMAND_CLUST) { @@ -188,7 +187,6 @@ LocalParameters::LocalParameters() : filtercomplex.push_back(&PARAM_C); filtercomplex.push_back(&PARAM_COV_MODE); filtercomplex.push_back(&PARAM_COMPLEX_TM_THRESHOLD); - filtercomplex.push_back(&PARAM_CHAIN_TM_THRESHOLD); filtercomplex.push_back(&PARAM_FILTER_MODE); // createcomplexreport @@ -252,7 +250,6 @@ LocalParameters::LocalParameters() : eValueThrExpandComplex = 10000.0; citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); filtComplexTmThr = 0.0; - filtChainTmThr = 0.0; filterMode = 0; //rewrite param vals. diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 5c50bfa3..a2679852 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -131,7 +131,6 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_INPUT_FORMAT) PARAMETER(PARAM_PDB_OUTPUT_MODE) PARAMETER(PARAM_COMPLEX_TM_THRESHOLD) - PARAMETER(PARAM_CHAIN_TM_THRESHOLD) PARAMETER(PARAM_FILTER_MODE) int prefMode; @@ -157,7 +156,6 @@ class LocalParameters : public Parameters { int inputFormat; int pdbOutputMode; float filtComplexTmThr; - float filtChainTmThr; int filterMode; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtercomplex.cpp index 836f4fbe..f060cff8 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtercomplex.cpp @@ -35,17 +35,6 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { } } -// bool hasChainnum(bool sameChainNum, int qChainNum, int tChainNum){ -// switch (sameChainNum){ -// case 1: -// if (qChainNum != tChainNum){ -// return false; -// }else{return true;} -// case 0: -// return true; -// } -// } - // bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores, unsigned int qChainNum, unsigned int tChainNum) { // if (chainTMThr > 0 ){ // switch (covMode) { @@ -88,8 +77,19 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { // return true; // } +int determineOctant(const float coord[3]) { + if (coord[0] >= 0 && coord[1] >= 0 && coord[2] >= 0) return 0; + if (coord[0] < 0 && coord[1] >= 0 && coord[2] >= 0) return 1; + if (coord[0] < 0 && coord[1] < 0 && coord[2] >= 0) return 2; + if (coord[0] >= 0 && coord[1] < 0 && coord[2] >= 0) return 3; + if (coord[0] >= 0 && coord[1] >= 0 && coord[2] < 0) return 4; + if (coord[0] < 0 && coord[1] >= 0 && coord[2] < 0) return 5; + if (coord[0] < 0 && coord[1] < 0 && coord[2] < 0) return 6; + if (coord[0] >= 0 && coord[1] < 0 && coord[2] < 0) return 7; +} + static bool isSameCoord(float qavgCoords[3], float tavgCoords[3]){ - return (qavgCoords[0]*tavgCoords[0] >= 0 && qavgCoords[1]*tavgCoords[1] >= 0 && qavgCoords[2]*tavgCoords[2] >= 0 ); + return(determineOctant(qavgCoords) == determineOctant(tavgCoords)) ; } class ComplexFilterCriteria { @@ -102,14 +102,30 @@ class ComplexFilterCriteria { double qTM; double tTM; bool sameCoord; + std::vector qOctaCount; + std::vector tOctaCount; std::vector alignedQChainTmScores; std::vector alignedTChainTmScores; + std::vector qChainKeys; + std::vector tChainKeys; + float refCoord[3]; + float t[3]; + float u[3][3]; + + ComplexFilterCriteria() {} - ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM, double qChainTm, double tChainTm, bool sameCoord) : - dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM), sameCoord(sameCoord) { - alignedQChainTmScores.push_back(qChainTm); - alignedTChainTmScores.push_back(tChainTm); - } + ComplexFilterCriteria(unsigned int dbKey, std::vector &qChainKeys, std::vector &tChainKeys, double qTM, double tTM, float tstring[3], float ustring[3][3]) : + dbKey(dbKey), qTM(qTM), tTM(tTM), qChainKeys(qChainKeys), tChainKeys(tChainKeys), qTotalAlnLen(0), tTotalAlnLen(0) { + std::copy(tstring, tstring + 3, t); + for (int i = 0; i < 3; i++) { + std::copy(ustring[i], ustring[i] + 3, u[i]); + } + for (int i = 0; i < 8; i++) { + qOctaCount.push_back(0); + tOctaCount.push_back(0); + } + sameCoord = 1; + } ~ComplexFilterCriteria() { alignedQChainTmScores.clear(); alignedTChainTmScores.clear(); @@ -118,10 +134,11 @@ class ComplexFilterCriteria { bool hasTM(float TMThr, int covMode, int filterMode){ switch (filterMode){ case LocalParameters::FILTER_MODE_INTERFACE: + case LocalParameters::FILTER_MODE_LOOSE: switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: return ((qTM>= TMThr) && (tTM >= TMThr)); - case Parameters::COV_MODE_TARGET: + case Parameters::COV_MODE_TARGET: return (tTM >= TMThr); case Parameters::COV_MODE_QUERY: return (qTM >= TMThr); @@ -130,65 +147,129 @@ class ComplexFilterCriteria { case Parameters::COV_MODE_LENGTH_SHORTER : return true; } + // case LocalParameters::FILTER_MODE_CONFORMATION: + //TODO + //For 1: maybe, check the minimum chain tmscore among all chain-chain(not new) tm + } } - bool hasChainNum(int covMode, int filterMode, int qChainNum, int tChainNum ){ - switch (filterMode){ - case LocalParameters::FILTER_MODE_INTERFACE: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); - case Parameters::COV_MODE_TARGET: - return (alignedTChainTmScores.size()==tChainNum); - case Parameters::COV_MODE_QUERY: - return (alignedQChainTmScores.size()==qChainNum); - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : - return true; - } - case LocalParameters::FILTER_MODE_CONFORMATION: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return (qChainNum==tChainNum); - default: - return true; - } - case LocalParameters::FILTER_MODE_LOOSE: - return true; - } - } + // bool hasChainNum(int covMode, int filterMode, int qChainNum, int tChainNum ){ + // switch (filterMode){ + // case LocalParameters::FILTER_MODE_INTERFACE: + // switch (covMode) { + // case Parameters::COV_MODE_BIDIRECTIONAL: + // return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); + // case Parameters::COV_MODE_TARGET: + // return (alignedTChainTmScores.size()==tChainNum); + // case Parameters::COV_MODE_QUERY: + // return (alignedQChainTmScores.size()==qChainNum); + // case Parameters::COV_MODE_LENGTH_QUERY : + // case Parameters::COV_MODE_LENGTH_TARGET : + // case Parameters::COV_MODE_LENGTH_SHORTER : + // return true; + // } + // case LocalParameters::FILTER_MODE_CONFORMATION: + // switch (covMode) { + // case Parameters::COV_MODE_BIDIRECTIONAL: + // return (qChainNum==tChainNum); + // default: + // return true; + // } + // case LocalParameters::FILTER_MODE_LOOSE: + // return true; + + // } + // } bool hasMatchedCoord(int filterMode){ switch (filterMode) { case LocalParameters::FILTER_MODE_INTERFACE: - return (sameCoord); + return (sameCoord == 1); case LocalParameters::FILTER_MODE_CONFORMATION: case LocalParameters::FILTER_MODE_LOOSE: return true; } } - bool satisfy(int covMode, int filterMode, float covThr, float TMThr, float chainTMThr, int qChainNum, int tChainNum ) { - //TODO - // add filtermode in hasTM. - // find other criteria for other filtermodes + bool hasMatchedOcta(int filterMode, int covMode, std::vector &qOctaCount, std::vector &tOctaCount){ + if (qChainKeys.size()>= 1){ + switch (filterMode) { + case LocalParameters::FILTER_MODE_INTERFACE: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + for (size_t i = 0; i < 8; i++) { + if (qOctaCount[i] != tOctaCount[i]) { + return false; + } + } + case Parameters::COV_MODE_TARGET: + for (size_t i = 0; i < 8; i++) { + if (qOctaCount[i] < tOctaCount[i]) { + return false; + } + } + return true; + case Parameters::COV_MODE_QUERY: + for (size_t i = 0; i < 8; i++) { + if (qOctaCount[i] > tOctaCount[i]) { + return false; + } + } + return true; + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return true; + } + case LocalParameters::FILTER_MODE_CONFORMATION: + case LocalParameters::FILTER_MODE_LOOSE: + return true; + } + } + return true; + } + + bool satisfy(int covMode, int filterMode, float covThr, float TMThr, int qChainNum, int tChainNum ) { const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); const bool TMOK = hasTM(TMThr, covMode, filterMode); - const bool chainNumOK = hasChainNum(covMode, filterMode, qChainNum, tChainNum); + // const bool chainNumOK = hasChainNum(covMode, filterMode, qChainNum, tChainNum); const bool coordOK = hasMatchedCoord(filterMode); // const bool chainTMOK = hasChainTm(chainTMThr, covMode, alignedQChainTmScores, alignedTChainTmScores, qChainNum, tChainNum); - // const bool numOK = hasChainnum(sameChainNum, qChainNum, tChainNum); // return (covOK && TMOK && chainTMOK && numOK); - return (covOK && TMOK && chainNumOK && coordOK); + const bool octaOK = hasMatchedOcta(filterMode, covMode, qOctaCount, tOctaCount); + // #pragma omp critical + // { + // if (covOK && TMOK && coordOK){ + // if(!octaOK){ + // if (qChainKeys.size()==tChainKeys.size()){ + // Debug(Debug::WARNING)<< qChainKeys[0] << "\t"<< tChainKeys[0]<< "\n" ; + // } + // } + // } + // } + return (covOK && TMOK && coordOK && octaOK); + // return(covOK && TMOK && coordOK); } - void update(unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm, bool sameCoord) { - this->qTotalAlnLen += qTotalAlnLen; + void update(unsigned int qChainKey, unsigned int tChainKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm, bool sameCoord, float qAvgCoord[3]) { this->tTotalAlnLen += tTotalAlnLen; + this->qTotalAlnLen += qTotalAlnLen; this->alignedQChainTmScores.push_back(qChainTm); this->alignedTChainTmScores.push_back(tChainTm); this->sameCoord *= sameCoord; + if (sameCoord){ + refCoord[0] = qAvgCoord[0]; + refCoord[1] = qAvgCoord[1]; + refCoord[2] = qAvgCoord[2]; + } + auto pos = std::find(qChainKeys.begin(), qChainKeys.end(), qChainKey); + if (pos != qChainKeys.end()) { + qChainKeys.erase(pos); + } + pos = std::find(tChainKeys.begin(), tChainKeys.end(), tChainKey); + if (pos != tChainKeys.end()) { + tChainKeys.erase(pos); + } } void calcCov(unsigned int qLen, unsigned int tLen) { @@ -288,9 +369,9 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u } return tmscore; } -void getAverageCoord( float * qdata, float qavgCoords[3], int qlen, float * tdata, float tavgCoords[3], int tlen, float t[3], float u[3][3]){ + +void getRotatedAverageCoord(float * tdata, float tavgCoords[3], int tlen, float t[3], float u[3][3]){ Coordinates tCoords(tlen); - for (int i=0; i < tlen ; i++ ){ tCoords.x[i] = tdata[i]; tCoords.y[i] = tdata[tlen+i]; @@ -298,6 +379,22 @@ void getAverageCoord( float * qdata, float qavgCoords[3], int qlen, float * tdat } Coordinates Coordsrot(tlen); BasicFunction::do_rotation(tCoords, Coordsrot, tlen, t, u); + float txsum=0.0, tysum=0.0, tzsum=0.0; + for (int i=0; i< tlen;i++){ + txsum += Coordsrot.x[i]; + tysum += Coordsrot.y[i]; + tzsum += Coordsrot.z[i]; + } + txsum /= tlen; + tysum /= tlen; + tzsum /= tlen; + tavgCoords[0] = txsum; + tavgCoords[1] = tysum; + tavgCoords[2] = tzsum; +} + +void getAverageCoord( float * qdata, float qavgCoords[3], int qlen){ + float qxsum=0.0, qysum=0.0, qzsum=0.0; for (int i=0; i< qlen;i++){ qxsum += qdata[i]; @@ -310,19 +407,12 @@ void getAverageCoord( float * qdata, float qavgCoords[3], int qlen, float * tdat qavgCoords[0] = qxsum; qavgCoords[1] = qysum; qavgCoords[2] = qzsum; +} - float txsum=0.0, tysum=0.0, tzsum=0.0; - for (int i=0; i< qlen;i++){ - txsum += Coordsrot.x[i]; - tysum += Coordsrot.y[i]; - tzsum += Coordsrot.z[i]; - } - txsum /= tlen; - tysum /= tlen; - tzsum /= tlen; - tavgCoords[0] = txsum; - tavgCoords[1] = tysum; - tavgCoords[2] = tzsum; +void getVector(float avgCoords[3], float refavgCoords[3]){ + avgCoords[0] -= refavgCoords[0]; + avgCoords[1] -= refavgCoords[1]; + avgCoords[2] -= refavgCoords[2]; } unsigned int getComplexResidueLength( IndexReader *Dbr, std::vector &ChainKeys) { @@ -478,7 +568,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { progress.updateProgress(); unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; - std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); + std::vector qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; @@ -509,11 +599,14 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int assId = retComplex.assId; unsigned int tChainKey= res.dbKey; unsigned int tChainAlnId = alnDbr.getId(tChainKey); - unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); //if target is monomer, break to be singleton if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ break; } + unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); + unsigned int tComplexId = tChainKeyToComplexIdMap.at(tChainKey); + std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); + float u[3][3]; float t[3]; fillUArr(retComplex.uString, u); @@ -524,28 +617,67 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); unsigned int alnLen = cigarToAlignedLength(res.backtrace); Coordinates qm(alnLen), tm(alnLen); + //FIXME: if new chainTM not required, erase those part fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); double qChainTm = chainTm / res.qLen; double tChainTm = chainTm/ res.dbLen; unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + float qAvgCoord[3], tAvgCoord[3]; - getAverageCoord(qdata, qAvgCoord, res.qLen, tdata, tAvgCoord, res.dbLen, t, u) ; - bool coordSame = isSameCoord(qAvgCoord, tAvgCoord ); + getRotatedAverageCoord(tdata, tAvgCoord, res.dbLen, t, u); + getAverageCoord(qdata, qAvgCoord, res.qLen) ; + bool coordSame = isSameCoord(qAvgCoord, tAvgCoord); if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm, coordSame); + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qChainKeys, tChainKeys, retComplex.qTmScore, retComplex.tTmScore, t, u); localComplexMap[assId] = cmplfiltcrit; + localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm, coordSame, qAvgCoord); } else { - localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm, coordSame); + localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm, coordSame, qAvgCoord); } + } // while end } for (auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - if (!assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, qChainKeys.size(), tChainKeys.size())){ + if (assId_res.second.qChainKeys.size()!= 0){ + std::vector qOctaCount(8,0); + std::vector tOctaCount(8,0); + for (unsigned int qChainKey : assId_res.second.qChainKeys){ + unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); + char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); + size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); + int qSeqLength = qDbr->sequenceReader->getSeqLen(qChainDbId); + float* qdata = qcoords.read(qcadata, qSeqLength, qCaLength); + float qAvgCoord[3]; + getAverageCoord(qdata, qAvgCoord, qSeqLength); + int tocheck = determineOctant(qAvgCoord); + getVector(qAvgCoord, assId_res.second.refCoord); + int qOcta = determineOctant(qAvgCoord); + qOctaCount[qOcta]++; + } + for (unsigned int tChainKey : assId_res.second.tChainKeys){ + unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); + char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); + size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); + int tSeqLength = tDbr->sequenceReader->getSeqLen(tChainDbId); + float* tdata = tcoords.read(tcadata, tSeqLength, tCaLength); + float tAvgCoord[3]; + getRotatedAverageCoord(tdata, tAvgCoord, tSeqLength, assId_res.second.t, assId_res.second.u); + getVector(tAvgCoord, assId_res.second.refCoord); + int tOcta = determineOctant(tAvgCoord); + tOctaCount[tOcta]++; + } + for (int i=0;i<8;i++){ + assId_res.second.qOctaCount[i] = qOctaCount[i]; + assId_res.second.tOctaCount[i] = tOctaCount[i]; + } + } + + if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtComplexTmThr, qChainKeys.size(), tChainKeys.size()))){ assIdsToDelete.push_back(assId_res.first); } } diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp index bfc98ab7..823d9a11 100644 --- a/src/workflow/ComplexCluster.cpp +++ b/src/workflow/ComplexCluster.cpp @@ -11,7 +11,7 @@ void setComplexClusterDefaults(LocalParameters *p) { p->covThr = 0.8; p->filtComplexTmThr = 0.5; // FIX - p->filtChainTmThr=0.0; // FIX + // p->filtChainTmThr=0.0; // FIX p->filterMode=0; p->covMode = 1; p->clusteringMode = Parameters::GREEDY; From 4604c238ea5bc0e53ccc739ced65e08f81614748 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 14 Jul 2024 20:33:14 +0900 Subject: [PATCH 101/160] complex to multimer --- ...mplexcluster.sh => easymultimercluster.sh} | 0 .../{complexcluster.sh => multimercluster.sh} | 4 +- src/FoldseekBase.cpp | 44 +-- src/LocalCommandDeclarations.h | 16 +- src/commons/LocalParameters.cpp | 97 ++--- src/commons/LocalParameters.h | 32 +- src/strucclustutils/CMakeLists.txt | 10 +- src/strucclustutils/convert2pdb.cpp | 2 +- src/strucclustutils/createcomplexreport.cpp | 2 +- .../{filtercomplex.cpp => filtermultimer.cpp} | 337 +++++------------- src/strucclustutils/structureconvertalis.cpp | 2 +- src/workflow/CMakeLists.txt | 8 +- src/workflow/ComplexSearch.cpp | 8 +- src/workflow/EasyComplexSearch.cpp | 8 +- 14 files changed, 210 insertions(+), 360 deletions(-) rename data/{easycomplexcluster.sh => easymultimercluster.sh} (100%) rename data/{complexcluster.sh => multimercluster.sh} (95%) rename src/strucclustutils/{filtercomplex.cpp => filtermultimer.cpp} (65%) diff --git a/data/easycomplexcluster.sh b/data/easymultimercluster.sh similarity index 100% rename from data/easycomplexcluster.sh rename to data/easymultimercluster.sh diff --git a/data/complexcluster.sh b/data/multimercluster.sh similarity index 95% rename from data/complexcluster.sh rename to data/multimercluster.sh index 0ba0b066..d36251b6 100644 --- a/data/complexcluster.sh +++ b/data/multimercluster.sh @@ -78,8 +78,8 @@ buldCmplhDb(){ if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" complexsearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complexsearch_tmp" ${COMPLEXSEARCH_PAR} \ - || fail "ComplexSearch died" + "$MMSEQS" multimersearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/multimersearch_tmp" ${COMPLEXSEARCH_PAR} \ + || fail "multimerSearch died" fi if notExists "complex_filt.dbtype"; then diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 07ffa19b..6ad9ef6c 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -254,17 +254,17 @@ std::vector foldseekCommands = { " ", CITATION_FOLDSEEK, {{"Db", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb }, {"pdbFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, - {"scorecomplex", scorecomplex, &localPar.scorecomplex, COMMAND_ALIGNMENT, + {"scoremultimer", scoremultimer, &localPar.scoremultimer, COMMAND_ALIGNMENT, "Get complex level alignments from alignmentDB", "# Get complex level alignments (chain assignments and tm-scores) from alignmentDB.\n" - "foldseek scorecomplex queryDB targetDB alignmentDB complexDB\n" + "foldseek scoremultimer queryDB targetDB alignmentDB complexDB\n" "# simple tsv output format" "foldseek createcomplexreport queryDB targetDB complexDB result.tsv" "# output files with convertalis" "foldseek convertalis queryDB targetDB complexDB result.m8\n\n", "Woosub Kim ", " ", - CITATION_FOLDSEEK, { + CITATION_FOLDSEEK_MULTIMER, { {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb}, @@ -276,7 +276,7 @@ std::vector foldseekCommands = { "foldseek filtercomplex queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", "Seongeun Kim & Sooyoung Cha ", " ", - CITATION_FOLDSEEK, { + CITATION_FOLDSEEK_MULTIMER, { {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, @@ -297,7 +297,7 @@ std::vector foldseekCommands = { "# -c 0.6 + + +\n\n", "Seongeun Kim & Sooyoung Cha ", " ", - CITATION_FOLDSEEK, { + CITATION_FOLDSEEK_MULTIMER, { {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}, {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } @@ -319,51 +319,51 @@ std::vector foldseekCommands = { "# -c 0.6 + + +\n\n", "Seongeun Kim & Sooyoung Cha ", " ... ", - CITATION_FOLDSEEK, { + CITATION_FOLDSEEK_MULTIMER, { {"PDB|mmCIF[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &FoldSeekDbValidator::flatfileStdinAndFolder}, {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } } }, - {"complexsearch", complexsearch, &localPar.complexsearchworkflow, COMMAND_MAIN, + {"multimersearch", multimersearch, &localPar.multimersearchworkflow, COMMAND_MAIN, "Complex level search", "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" - "foldseek complexsearch queryDB targetDB result tmp\n" + "foldseek multimersearch queryDB targetDB result tmp\n" "# Format output differently\n" - "foldseek complexsearch queryDB targetDB result tmp --format-output query,target,qstart,tstart,cigar\n" + "foldseek multimersearch queryDB targetDB result tmp --format-output query,target,qstart,tstart,cigar\n" "# Align with TMalign (global)\n" - "foldseek complexsearch queryDB targetDB result tmp --alignment-type 1\n" + "foldseek multimersearch queryDB targetDB result tmp --alignment-type 1\n" "# Skip prefilter and perform an exhaustive alignment (slower but more sensitive)\n" - "foldseek complexsearch queryDB targetDB result tmp --exhaustive-search 1\n\n", + "foldseek multimersearch queryDB targetDB result tmp --exhaustive-search 1\n\n", "Woosub Kim ", " ", - CITATION_FOLDSEEK, { + CITATION_FOLDSEEK_MULTIMER, { {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::NEED_HEADER, &DbValidator::sequenceDb}, {"complexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb}, {"tempDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory} } }, - {"easy-complexsearch", easycomplexsearch, &localPar.easyscomplexsearchworkflow, COMMAND_EASY, + {"easy-multimersearch", easymultimersearch, &localPar.easysmultimersearchworkflow, COMMAND_EASY, "Complex level search", "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" - "foldseek easy-complexsearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp\n" + "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp\n" "# Format output differently\n" - "foldseek easy-complexsearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp --format-output query,target,qstart,tstart,cigar\n" + "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp --format-output query,target,qstart,tstart,cigar\n" "# Align with TMalign (global)\n" - "foldseek easy-complexsearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp --alignment-type 1\n" + "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp --alignment-type 1\n" "# Skip prefilter and perform an exhaustive alignment (slower but more sensitive)\n" - "foldseek easy-complexsearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp --exhaustive-search 1\n\n", + "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp --exhaustive-search 1\n\n", "Woosub Kim ", " ... | | ", - CITATION_FOLDSEEK, { + CITATION_FOLDSEEK_MULTIMER, { {"PDB|mmCIF[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &FoldSeekDbValidator::flatfileStdinAndFolder}, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &FoldSeekDbValidator::flatfileAndFolder}, {"outputFileName", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}, {"tempDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory} } }, - {"createcomplexreport", createcomplexreport, &localPar.createcomplexreport, COMMAND_FORMAT_CONVERSION, + {"createmultimerreport", createcomplexreport, &localPar.createcomplexreport, COMMAND_FORMAT_CONVERSION, "Convert complexDB to tsv format", "# Create output in tsv format (9 columns): qComplexName.c_str(), tComplexName.c_str(), qChainString.c_str(), tChainString.c_str(), qTMScore, tTMScore, u, t, assId\n" "# (1,2) identifiers for query and target complex,\n" @@ -374,14 +374,14 @@ std::vector foldseekCommands = { "foldseek createcomplexreport queryDB targetDB complexDB result.tsv\n", "Woosub Kim ", " ", - CITATION_FOLDSEEK, { + CITATION_FOLDSEEK_MULTIMER, { {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb }, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb }, {"complexDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, {"complexFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile} } }, - {"expandcomplex", expandcomplex, &localPar.expandcomplex, COMMAND_PREFILTER, + {"expandmultimer", expandmultimer, &localPar.expandmultimer, COMMAND_PREFILTER, "Re-prefilter to ensure complete alignment between complexes", NULL, "Woosub Kim ", @@ -398,7 +398,7 @@ std::vector foldseekCommands = { NULL, "", "", - CITATION_FOLDSEEK, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}} + CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}} }; std::vector externalThreshold = { {Parameters::DBTYPE_AMINO_ACIDS, 7, 197.0, 11.22}}; diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 2f6c00fd..6523b69b 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -20,12 +20,12 @@ extern int structureeasyrbh(int argc, const char** argv, const Command &command) extern int structureungappedalign(int argc, const char** argv, const Command &command); extern int convert2pdb(int argc, const char** argv, const Command &command); extern int compressca(int argc, const char** argv, const Command &command); -extern int scorecomplex(int argc, const char **argv, const Command& command); -extern int filtercomplex(int argc, const char **argv, const Command& command); -extern int easycomplexcluster(int argc, const char** argv, const Command &command); -extern int complexcluster(int argc, const char** argv, const Command &command); -extern int easycomplexsearch(int argc, const char **argv, const Command &command); -extern int createcomplexreport(int argc, const char **argv, const Command &command); -extern int expandcomplex(int argc, const char **argv, const Command &command); -extern int complexsearch(int argc, const char **argv, const Command &command); +extern int scoremultimer(int argc, const char **argv, const Command& command); +extern int filtermultimer(int argc, const char **argv, const Command& command); +extern int easymultimerluster(int argc, const char** argv, const Command &command); +extern int multimerluster(int argc, const char** argv, const Command &command); +extern int easymultimersearch(int argc, const char **argv, const Command &command); +extern int createmultimerreport(int argc, const char **argv, const Command &command); +extern int expandmultimer(int argc, const char **argv, const Command &command); +extern int multimersearch(int argc, const char **argv, const Command &command); #endif diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 1ef9a4b4..9d5d4bb2 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -27,11 +27,12 @@ LocalParameters::LocalParameters() : PARAM_FILE_INCLUDE(PARAM_FILE_INCLUDE_ID, "--file-include", "File Inclusion Regex", "Include file names based on this regex", typeid(std::string), (void *) &fileInclude, "^.*$"), PARAM_FILE_EXCLUDE(PARAM_FILE_EXCLUDE_ID, "--file-exclude", "File Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &fileExclude, "^.*$"), PARAM_INDEX_EXCLUDE(PARAM_INDEX_EXCLUDE_ID, "--index-exclude", "Index Exclusion", "Exclude parts of the index:\n0: Full index\n1: Exclude k-mer index (for use with --prefilter-mode 1)\n2: Exclude C-alpha coordinates (for use with --sort-by-structure-bits 0)\nFlags can be combined bit wise", typeid(int), (void *) &indexExclude, "^[0-3]{1}$", MMseqsParameter::COMMAND_EXPERT), - PARAM_COMPLEX_REPORT_MODE(PARAM_COMPLEX_REPORT_MODE_ID, "--complex-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &complexReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), - PARAM_EXPAND_COMPLEX_EVALUE(PARAM_EXPAND_COMPLEX_EVALUE_ID, "--expand-complex-evalue", "E-value threshold for expandcomplex", "E-value threshold for expandcomplex (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandComplex, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN), + PARAM_MULTIMER_REPORT_MODE(PARAM_MULTIMER_REPORT_MODE_ID, "--multimer-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &multimerReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), + PARAM_MULTIMER_REPORT_MODE(PARAM_MULTIMER_REPORT_MODE_ID, "--multimer-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &multimerReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), - PARAM_COMPLEX_TM_THRESHOLD(PARAM_COMPLEX_TM_THRESHOLD_ID,"--complex-tm-threshold", "TMscore threshold for filtercomplex", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtComplexTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_MULTIMER_TM_THRESHOLD(PARAM_MULTIMER_TM_THRESHOLD_ID,"--multimer-tm-threshold", "TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtMultimerTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "chain TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_FILTER_MODE(PARAM_FILTER_MODE_ID, "--filter-mode", "Filter mode", "0: Interface\n1: Conformation\n2: loose", typeid(int), (void *) &filterMode, "[0-2]{0}$", MMseqsParameter::COMMAND_CLUST) { @@ -176,42 +177,43 @@ LocalParameters::LocalParameters() : compressca.push_back(&PARAM_THREADS); compressca.push_back(&PARAM_V); - //scorecomplex - scorecomplex.push_back(&PARAM_THREADS); - scorecomplex.push_back(&PARAM_V); - scorecomplex.push_back(&PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD); - - //filtercomplex - filtercomplex.push_back(&PARAM_V); - filtercomplex.push_back(&PARAM_THREADS); - filtercomplex.push_back(&PARAM_C); - filtercomplex.push_back(&PARAM_COV_MODE); - filtercomplex.push_back(&PARAM_COMPLEX_TM_THRESHOLD); - filtercomplex.push_back(&PARAM_FILTER_MODE); - - // createcomplexreport - createcomplexreport.push_back(&PARAM_DB_OUTPUT); - createcomplexreport.push_back(&PARAM_THREADS); - createcomplexreport.push_back(&PARAM_V); - - // complexsearchworkflow - complexsearchworkflow = combineList(structuresearchworkflow, scorecomplex); - complexsearchworkflow.push_back(&PARAM_EXPAND_COMPLEX_EVALUE); - - // easycomplexsearchworkflow - easyscomplexsearchworkflow = combineList(structurecreatedb, complexsearchworkflow); - easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, convertalignments); - easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, createcomplexreport); - easyscomplexsearchworkflow = combineList(easyscomplexsearchworkflow, createsubdb); - easyscomplexsearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE); - - // complexclusterworkflow - complexclusterworkflow = combineList(complexsearchworkflow, filtercomplex); - complexclusterworkflow = combineList(complexclusterworkflow, clust); - - //easycomplexclusterworkflow - easycomplexclusterworkflow = combineList(structurecreatedb, complexclusterworkflow); - easycomplexclusterworkflow = combineList(easycomplexclusterworkflow, result2repseq); + //scorecmultimer + scoremultimer.push_back(&PARAM_THREADS); + scoremultimer.push_back(&PARAM_V); + scoremultimer.push_back(&PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD); + + //filtermultimer + filtermultimer.push_back(&PARAM_V); + filtermultimer.push_back(&PARAM_THREADS); + filtermultimer.push_back(&PARAM_C); + filtermultimer.push_back(&PARAM_COV_MODE); + filtermultimer.push_back(&PARAM_COMPLEX_TM_THRESHOLD); + filtermultimer.push_back(&PARAM_CHAIN_TM_THRESHOLD); + filtermultimer.push_back(&PARAM_FILTER_MODE); + + // createmultimerreport + createmultimerreport.push_back(&PARAM_DB_OUTPUT); + createmultimerreport.push_back(&PARAM_THREADS); + createmultimerreport.push_back(&PARAM_V); + + // multimersearchworkflow + multimersearchworkflow = combineList(structuresearchworkflow, scoremultimer); + multimersearchworkflow.push_back(&PARAM_EXPAND_COMPLEX_EVALUE); + + // easysmultimersearchworkflow + easysmultimersearchworkflow = combineList(structurecreatedb, multimersearchworkflow); + easysmultimersearchworkflow = combineList(easysmultimersearchworkflow, convertalignments); + easysmultimersearchworkflow = combineList(easysmultimersearchworkflow, createmultimerreport); + easysmultimersearchworkflow = combineList(easysmultimersearchworkflow, createsubdb); + easysmultimersearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE); + + // multimerclusterworkflow + multimerclusterworkflow = combineList(multimersearchworkflow, filtermultimer); + multimerclusterworkflow = combineList(multimerclusterworkflow, clust); + + //easymultimerlusterworkflow + easymultimerlusterworkflow = combineList(structurecreatedb, multimerclusterworkflow); + easymultimerlusterworkflow = combineList(easymultimerlusterworkflow, result2repseq); // expandcomplex expandcomplex.push_back(&PARAM_THREADS); @@ -246,10 +248,11 @@ LocalParameters::LocalParameters() : fileExclude = "^$"; dbSuffixList = "_h,_ss,_ca"; indexExclude = 0; - complexReportMode = 1; - eValueThrExpandComplex = 10000.0; + multimerReportMode = 1; + eValueThrExpandMultimer = 10000.0; citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); - filtComplexTmThr = 0.0; + filtMultimerTmThr = 0.0; + filtChainTmThr = 0.0; filterMode = 0; //rewrite param vals. @@ -317,11 +320,11 @@ std::vector LocalParameters::getOutputFormat(int formatMode, const std::str else if (outformatSplit[i].compare("lddtfull") == 0) { needQCa = true; needTCa = true; needLDDT = true; needBacktrace = true; code = LocalParameters::OUTFMT_LDDT_FULL; } else if (outformatSplit[i].compare("prob") == 0) { needQCa = true; needTCa = true; needLDDT = true; needBacktrace = true; needTMaligner = true; code = LocalParameters::OUTFMT_PROBTP; } // TODO - else if (outformatSplit[i].compare("complexqtmscore")==0){code=LocalParameters::OUTFMT_Q_COMPLEX_TMSCORE; } - else if (outformatSplit[i].compare("complexttmscore")==0){code=LocalParameters::OUTFMT_T_COMPLEX_TMSCORE;} - else if (outformatSplit[i].compare("complexassignid")==0){code=LocalParameters::OUTFMT_ASSIGN_ID;} - else if (outformatSplit[i].compare("complexu")==0){code=LocalParameters::OUTFMT_COMPLEX_U;} - else if (outformatSplit[i].compare("complext")==0){code=LocalParameters::OUTFMT_COMPLEX_T;} + else if (outformatSplit[i].compare("complexqtmscore")==0 || outformatSplit[i].compare("multimerqtmscore")==0) {code=LocalParameters::OUTFMT_Q_COMPLEX_TMSCORE; } + else if (outformatSplit[i].compare("complexttmscore")==0 || outformatSplit[i].compare("multimerttmscore")==0){code=LocalParameters::OUTFMT_T_COMPLEX_TMSCORE;} + else if (outformatSplit[i].compare("complexassignid")==0 || outformatSplit[i].compare("multimerassignid")==0){code=LocalParameters::OUTFMT_ASSIGN_ID;} + else if (outformatSplit[i].compare("complexu")==0 || outformatSplit[i].compare("multimeru")==0){code=LocalParameters::OUTFMT_COMPLEX_U;} + else if (outformatSplit[i].compare("complext")==0 || outformatSplit[i].compare("multimert")==0){code=LocalParameters::OUTFMT_COMPLEX_T;} else { Debug(Debug::ERROR) << "Format code " << outformatSplit[i] << " does not exist."; EXIT(EXIT_FAILURE); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index a2679852..f50ed27f 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -54,7 +54,7 @@ class LocalParameters : public Parameters { static const int OUTFMT_PROBTP = 48; static const int OUTFMT_QTMSCORE = 49; static const int OUTFMT_TTMSCORE = 50; - // for scorecomplex + // for Foldseek-MM static const int OUTFMT_QUERY_COMPLEX = 51; static const int OUTFMT_TARGET_COMPLEX = 52; static const int OUTFMT_Q_COMPLEX_TMSCORE = 53; @@ -98,14 +98,14 @@ class LocalParameters : public Parameters { std::vector easystructureclusterworkflow; std::vector structurecreatedb; std::vector compressca; - std::vector scorecomplex; - std::vector filtercomplex; - std::vector complexclusterworkflow; - std::vector easycomplexclusterworkflow; - std::vector complexsearchworkflow; - std::vector easyscomplexsearchworkflow; - std::vector createcomplexreport; - std::vector expandcomplex; + std::vector scoremultimer; + std::vector filtermultimer; + std::vector multimerclusterworkflow; + std::vector easymultimerlusterworkflow; + std::vector multimersearchworkflow; + std::vector easysmultimersearchworkflow; + std::vector createmultimerreport; + std::vector expandmultimer; std::vector convert2pdb; PARAMETER(PARAM_PREF_MODE) @@ -126,11 +126,12 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_FILE_INCLUDE) PARAMETER(PARAM_FILE_EXCLUDE) PARAMETER(PARAM_INDEX_EXCLUDE) - PARAMETER(PARAM_COMPLEX_REPORT_MODE) - PARAMETER(PARAM_EXPAND_COMPLEX_EVALUE) + PARAMETER(PARAM_MULTIMER_REPORT_MODE) + PARAMETER(PARAM_EXPAND_MULTIMER_EVALUE) PARAMETER(PARAM_INPUT_FORMAT) PARAMETER(PARAM_PDB_OUTPUT_MODE) - PARAMETER(PARAM_COMPLEX_TM_THRESHOLD) + PARAMETER(PARAM_MULTIMER_TM_THRESHOLD) + PARAMETER(PARAM_CHAIN_TM_THRESHOLD) PARAMETER(PARAM_FILTER_MODE) int prefMode; @@ -151,11 +152,12 @@ class LocalParameters : public Parameters { std::string fileInclude; std::string fileExclude; int indexExclude; - int complexReportMode; - double eValueThrExpandComplex; + int multimerReportMode; + double eValueThrExpandMultimer; int inputFormat; int pdbOutputMode; - float filtComplexTmThr; + float filtMultimerTmThr; + float filtChainTmThr; int filterMode; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, diff --git a/src/strucclustutils/CMakeLists.txt b/src/strucclustutils/CMakeLists.txt index 992f91b7..cf641390 100644 --- a/src/strucclustutils/CMakeLists.txt +++ b/src/strucclustutils/CMakeLists.txt @@ -13,11 +13,11 @@ set(strucclustutils_source_files strucclustutils/structurerescorediagonal.cpp strucclustutils/convert2pdb.cpp strucclustutils/compressca.cpp - strucclustutils/scorecomplex.cpp - strucclustutils/filtercomplex.cpp - strucclustutils/createcomplexreport.cpp - strucclustutils/createcomplexreport.h - strucclustutils/expandcomplex.cpp + strucclustutils/scoremultimer.cpp + strucclustutils/filtermultimer.cpp + strucclustutils/createmultimereport.cpp + strucclustutils/createcmultimerreport.h + strucclustutils/expandmultimer.cpp PARENT_SCOPE ) diff --git a/src/strucclustutils/convert2pdb.cpp b/src/strucclustutils/convert2pdb.cpp index 64750581..a6d3d57a 100644 --- a/src/strucclustutils/convert2pdb.cpp +++ b/src/strucclustutils/convert2pdb.cpp @@ -7,7 +7,7 @@ #include "Util.h" #include "FileUtil.h" #include "Coordinate16.h" -#include "createcomplexreport.h" +#include "MultimerUtil.h" #ifdef OPENMP #include diff --git a/src/strucclustutils/createcomplexreport.cpp b/src/strucclustutils/createcomplexreport.cpp index 0346ad00..c965df35 100644 --- a/src/strucclustutils/createcomplexreport.cpp +++ b/src/strucclustutils/createcomplexreport.cpp @@ -75,7 +75,7 @@ struct ComplexAlignment { unsigned int assId; }; -int createcomplexreport(int argc, const char **argv, const Command &command) { +int createmultimerreport(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.parseParameters(argc, argv, command, true, 0, 0); const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; diff --git a/src/strucclustutils/filtercomplex.cpp b/src/strucclustutils/filtermultimer.cpp similarity index 65% rename from src/strucclustutils/filtercomplex.cpp rename to src/strucclustutils/filtermultimer.cpp index f060cff8..a1200209 100644 --- a/src/strucclustutils/filtercomplex.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -35,63 +35,6 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { } } -// bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores, unsigned int qChainNum, unsigned int tChainNum) { -// if (chainTMThr > 0 ){ -// switch (covMode) { -// case Parameters::COV_MODE_BIDIRECTIONAL: -// if (qChainTmScores.size()= 0 && coord[1] >= 0 && coord[2] >= 0) return 0; - if (coord[0] < 0 && coord[1] >= 0 && coord[2] >= 0) return 1; - if (coord[0] < 0 && coord[1] < 0 && coord[2] >= 0) return 2; - if (coord[0] >= 0 && coord[1] < 0 && coord[2] >= 0) return 3; - if (coord[0] >= 0 && coord[1] >= 0 && coord[2] < 0) return 4; - if (coord[0] < 0 && coord[1] >= 0 && coord[2] < 0) return 5; - if (coord[0] < 0 && coord[1] < 0 && coord[2] < 0) return 6; - if (coord[0] >= 0 && coord[1] < 0 && coord[2] < 0) return 7; -} - -static bool isSameCoord(float qavgCoords[3], float tavgCoords[3]){ - return(determineOctant(qavgCoords) == determineOctant(tavgCoords)) ; -} - class ComplexFilterCriteria { public: unsigned int dbKey; @@ -101,14 +44,10 @@ class ComplexFilterCriteria { float tCov; double qTM; double tTM; - bool sameCoord; - std::vector qOctaCount; - std::vector tOctaCount; std::vector alignedQChainTmScores; std::vector alignedTChainTmScores; std::vector qChainKeys; std::vector tChainKeys; - float refCoord[3]; float t[3]; float u[3][3]; @@ -120,11 +59,6 @@ class ComplexFilterCriteria { for (int i = 0; i < 3; i++) { std::copy(ustring[i], ustring[i] + 3, u[i]); } - for (int i = 0; i < 8; i++) { - qOctaCount.push_back(0); - tOctaCount.push_back(0); - } - sameCoord = 1; } ~ComplexFilterCriteria() { alignedQChainTmScores.clear(); @@ -147,121 +81,116 @@ class ComplexFilterCriteria { case Parameters::COV_MODE_LENGTH_SHORTER : return true; } - // case LocalParameters::FILTER_MODE_CONFORMATION: - //TODO - //For 1: maybe, check the minimum chain tmscore among all chain-chain(not new) tm + case LocalParameters::FILTER_MODE_CONFORMATION: + return true; } } - // bool hasChainNum(int covMode, int filterMode, int qChainNum, int tChainNum ){ - // switch (filterMode){ - // case LocalParameters::FILTER_MODE_INTERFACE: - // switch (covMode) { - // case Parameters::COV_MODE_BIDIRECTIONAL: - // return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); - // case Parameters::COV_MODE_TARGET: - // return (alignedTChainTmScores.size()==tChainNum); - // case Parameters::COV_MODE_QUERY: - // return (alignedQChainTmScores.size()==qChainNum); - // case Parameters::COV_MODE_LENGTH_QUERY : - // case Parameters::COV_MODE_LENGTH_TARGET : - // case Parameters::COV_MODE_LENGTH_SHORTER : - // return true; - // } - // case LocalParameters::FILTER_MODE_CONFORMATION: - // switch (covMode) { - // case Parameters::COV_MODE_BIDIRECTIONAL: - // return (qChainNum==tChainNum); - // default: - // return true; - // } - // case LocalParameters::FILTER_MODE_LOOSE: - // return true; - - // } - // } - - bool hasMatchedCoord(int filterMode){ - switch (filterMode) { + bool hasChainNum(int covMode, int filterMode, int qChainNum, int tChainNum ){ + switch (filterMode){ case LocalParameters::FILTER_MODE_INTERFACE: - return (sameCoord == 1); + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); + case Parameters::COV_MODE_TARGET: + return (alignedTChainTmScores.size()==tChainNum); + case Parameters::COV_MODE_QUERY: + return (alignedQChainTmScores.size()==qChainNum); + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return true; + } case LocalParameters::FILTER_MODE_CONFORMATION: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (qChainNum==tChainNum); + case Parameters::COV_MODE_TARGET: + return (qChainNum>=tChainNum); + case Parameters::COV_MODE_QUERY: + return (qChainNum<=tChainNum); + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return true; + } case LocalParameters::FILTER_MODE_LOOSE: return true; + } - } + } - bool hasMatchedOcta(int filterMode, int covMode, std::vector &qOctaCount, std::vector &tOctaCount){ - if (qChainKeys.size()>= 1){ - switch (filterMode) { - case LocalParameters::FILTER_MODE_INTERFACE: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - for (size_t i = 0; i < 8; i++) { - if (qOctaCount[i] != tOctaCount[i]) { - return false; - } + + bool hasChainTm(float chainTMThr, int covMode, int filterMode, unsigned int qChainNum, unsigned int tChainNum) { + switch (filterMode){ + case LocalParameters::FILTER_MODE_INTERFACE: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + if (alignedQChainTmScores.size() tOctaCount[i]) { - return false; - } + } + break; + case Parameters::COV_MODE_QUERY: + if (alignedQChainTmScores.size()tTotalAlnLen += tTotalAlnLen; this->qTotalAlnLen += qTotalAlnLen; this->alignedQChainTmScores.push_back(qChainTm); this->alignedTChainTmScores.push_back(tChainTm); - this->sameCoord *= sameCoord; - if (sameCoord){ - refCoord[0] = qAvgCoord[0]; - refCoord[1] = qAvgCoord[1]; - refCoord[2] = qAvgCoord[2]; - } auto pos = std::find(qChainKeys.begin(), qChainKeys.end(), qChainKey); if (pos != qChainKeys.end()) { qChainKeys.erase(pos); @@ -370,51 +299,6 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u return tmscore; } -void getRotatedAverageCoord(float * tdata, float tavgCoords[3], int tlen, float t[3], float u[3][3]){ - Coordinates tCoords(tlen); - for (int i=0; i < tlen ; i++ ){ - tCoords.x[i] = tdata[i]; - tCoords.y[i] = tdata[tlen+i]; - tCoords.z[i] = tdata[2*tlen+i]; - } - Coordinates Coordsrot(tlen); - BasicFunction::do_rotation(tCoords, Coordsrot, tlen, t, u); - float txsum=0.0, tysum=0.0, tzsum=0.0; - for (int i=0; i< tlen;i++){ - txsum += Coordsrot.x[i]; - tysum += Coordsrot.y[i]; - tzsum += Coordsrot.z[i]; - } - txsum /= tlen; - tysum /= tlen; - tzsum /= tlen; - tavgCoords[0] = txsum; - tavgCoords[1] = tysum; - tavgCoords[2] = tzsum; -} - -void getAverageCoord( float * qdata, float qavgCoords[3], int qlen){ - - float qxsum=0.0, qysum=0.0, qzsum=0.0; - for (int i=0; i< qlen;i++){ - qxsum += qdata[i]; - qysum += qdata[qlen+i]; - qzsum += qdata[2*qlen+i]; - } - qxsum /= qlen; - qysum /= qlen; - qzsum /= qlen; - qavgCoords[0] = qxsum; - qavgCoords[1] = qysum; - qavgCoords[2] = qzsum; -} - -void getVector(float avgCoords[3], float refavgCoords[3]){ - avgCoords[0] -= refavgCoords[0]; - avgCoords[1] -= refavgCoords[1]; - avgCoords[2] -= refavgCoords[2]; -} - unsigned int getComplexResidueLength( IndexReader *Dbr, std::vector &ChainKeys) { unsigned int ResidueLen = 0; for (auto ChainKey: ChainKeys) { @@ -624,60 +508,21 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t double tChainTm = chainTm/ res.dbLen; unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - - float qAvgCoord[3], tAvgCoord[3]; - getRotatedAverageCoord(tdata, tAvgCoord, res.dbLen, t, u); - getAverageCoord(qdata, qAvgCoord, res.qLen) ; - bool coordSame = isSameCoord(qAvgCoord, tAvgCoord); if (localComplexMap.find(assId) == localComplexMap.end()) { ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qChainKeys, tChainKeys, retComplex.qTmScore, retComplex.tTmScore, t, u); localComplexMap[assId] = cmplfiltcrit; - localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm, coordSame, qAvgCoord); + localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm); } else { - localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm, coordSame, qAvgCoord); + localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm); } } // while end } for (auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); - assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - if (assId_res.second.qChainKeys.size()!= 0){ - std::vector qOctaCount(8,0); - std::vector tOctaCount(8,0); - for (unsigned int qChainKey : assId_res.second.qChainKeys){ - unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); - char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); - size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); - int qSeqLength = qDbr->sequenceReader->getSeqLen(qChainDbId); - float* qdata = qcoords.read(qcadata, qSeqLength, qCaLength); - float qAvgCoord[3]; - getAverageCoord(qdata, qAvgCoord, qSeqLength); - int tocheck = determineOctant(qAvgCoord); - getVector(qAvgCoord, assId_res.second.refCoord); - int qOcta = determineOctant(qAvgCoord); - qOctaCount[qOcta]++; - } - for (unsigned int tChainKey : assId_res.second.tChainKeys){ - unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); - char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); - size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); - int tSeqLength = tDbr->sequenceReader->getSeqLen(tChainDbId); - float* tdata = tcoords.read(tcadata, tSeqLength, tCaLength); - float tAvgCoord[3]; - getRotatedAverageCoord(tdata, tAvgCoord, tSeqLength, assId_res.second.t, assId_res.second.u); - getVector(tAvgCoord, assId_res.second.refCoord); - int tOcta = determineOctant(tAvgCoord); - tOctaCount[tOcta]++; - } - for (int i=0;i<8;i++){ - assId_res.second.qOctaCount[i] = qOctaCount[i]; - assId_res.second.tOctaCount[i] = tOctaCount[i]; - } - } - - if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtComplexTmThr, qChainKeys.size(), tChainKeys.size()))){ + assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); + if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, qChainKeys.size(), tChainKeys.size()))){ assIdsToDelete.push_back(assId_res.first); } } diff --git a/src/strucclustutils/structureconvertalis.cpp b/src/strucclustutils/structureconvertalis.cpp index c07c83b9..6c0e8dae 100644 --- a/src/strucclustutils/structureconvertalis.cpp +++ b/src/strucclustutils/structureconvertalis.cpp @@ -13,7 +13,7 @@ #include "NcbiTaxonomy.h" #include "MappingReader.h" #include "Coordinate16.h" -#include "createcomplexreport.h" +#include "MultimerUtil.h" #define ZSTD_STATIC_LINKING_ONLY diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index 8d551f79..4fe13d55 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -6,9 +6,9 @@ set(workflow_source_files workflow/EasyStructureRbh.cpp workflow/EasyStructureSearch.cpp workflow/EasyStructureCluster.cpp - workflow/EasyComplexSearch.cpp - workflow/ComplexSearch.cpp - workflow/ComplexCluster.cpp - workflow/EasyComplexCluster.cpp + workflow/EasyMultimerSearch.cpp + workflow/Multimerearch.cpp + workflow/MultimerCluster.cpp + workflow/EasyMultimerCluster.cpp PARENT_SCOPE ) diff --git a/src/workflow/ComplexSearch.cpp b/src/workflow/ComplexSearch.cpp index 44a197e6..b326e488 100644 --- a/src/workflow/ComplexSearch.cpp +++ b/src/workflow/ComplexSearch.cpp @@ -6,10 +6,10 @@ #include "Util.h" #include "Debug.h" -#include "complexsearch.sh.h" +#include "multimersearch.sh.h" -int complexsearch(int argc, const char **argv, const Command &command) { +int multimersearch(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); @@ -115,8 +115,8 @@ int complexsearch(int argc, const char **argv, const Command &command) { cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); - std::string program = tmpDir + "/complexsearch.sh"; - FileUtil::writeFile(program, complexsearch_sh, complexsearch_sh_len); + std::string program = tmpDir + "/multimersearch.sh"; + FileUtil::writeFile(program, multimersearch_sh, multimersearch_sh_len); cmd.execProgram(program.c_str(), par.filenames); // Should never get here assert(false); diff --git a/src/workflow/EasyComplexSearch.cpp b/src/workflow/EasyComplexSearch.cpp index d9f5c335..273d1178 100644 --- a/src/workflow/EasyComplexSearch.cpp +++ b/src/workflow/EasyComplexSearch.cpp @@ -6,9 +6,9 @@ #include "Util.h" #include "Debug.h" -#include "easycomplexsearch.sh.h" +#include "easymultimersearch.sh.h" -int easycomplexsearch(int argc, const char **argv, const Command &command) { +int easymultimersearch(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); @@ -124,8 +124,8 @@ int easycomplexsearch(int argc, const char **argv, const Command &command) { cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); - std::string program = tmpDir + "/easycomplexsearch.sh"; - FileUtil::writeFile(program, easycomplexsearch_sh, easycomplexsearch_sh_len); + std::string program = tmpDir + "/easymultimersearch.sh"; + FileUtil::writeFile(program, easymultimersearch_sh, easymultimersearch_sh_len); cmd.execProgram(program.c_str(), par.filenames); // Should never get here assert(false); From c5f59d20e3309be5cb8a862f87481ab9d600e3e3 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 14 Jul 2024 20:56:54 +0900 Subject: [PATCH 102/160] Merge branch 'steineggerlab:master' --- data/CMakeLists.txt | 9 +--- src/FoldseekBase.cpp | 58 ++++---------------- src/LocalCommandDeclarations.h | 3 -- src/commons/LocalParameters.cpp | 73 ++++++-------------------- src/commons/LocalParameters.h | 23 +++----- src/strucclustutils/CMakeLists.txt | 5 -- src/strucclustutils/structcreatedb.cpp | 13 ----- src/workflow/CMakeLists.txt | 6 +-- src/workflow/EasyMultimerSearch.cpp | 5 -- src/workflow/MultimerSearch.cpp | 8 --- 10 files changed, 33 insertions(+), 170 deletions(-) diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index f84eba81..866fc66d 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -13,15 +13,10 @@ set(COMPILED_RESOURCES evalue_nn.kerasify main.js vendor.js.zst -<<<<<<< HEAD - complexsearch.sh - easycomplexsearch.sh - complexcluster.sh - easycomplexcluster.sh -======= multimersearch.sh easymultimersearch.sh ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 + multimerluster.sh + easymultimercluster.sh ) set(GENERATED_OUTPUT_HEADERS "") diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 139a8d86..9837ed8c 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -274,10 +274,9 @@ std::vector foldseekCommands = { {"complexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb} } }, -<<<<<<< HEAD - {"filtercomplex", filtercomplex, &localPar.filtercomplex, COMMAND_HIDDEN, - "Filters complexes satisfying given coverage", - "foldseek filtercomplex queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", + {"filtermultimer", filtermultimer, &localPar.filtermultimer, COMMAND_HIDDEN, + "Filters multimers satisfying given coverage", + "foldseek filtermultimer queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", "Seongeun Kim & Sooyoung Cha ", " ", CITATION_FOLDSEEK_MULTIMER, { @@ -289,10 +288,10 @@ std::vector foldseekCommands = { {"tmptsv", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::flatfile } } }, - {"complexcluster", complexcluster, &localPar.complexclusterworkflow, COMMAND_MAIN, - "Complex level cluster", + {"multimercluster", multimercluster, &localPar.multimerclusterworkflow, COMMAND_MAIN, + "Multimer level cluster", "#Clustering of PDB DB\n" - "foldseek complexcluster queryDB clusterDB tmp\n" + "foldseek multimercluster queryDB clusterDB tmp\n" "# --cov-mode \n" "# Sequence 0 1 2\n" "# Q: MAVGTACRPA 60% IGN 60%\n" @@ -307,10 +306,10 @@ std::vector foldseekCommands = { {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } } }, - {"easy-complexcluster", easycomplexcluster, &localPar.easycomplexclusterworkflow, COMMAND_EASY, - "Complex level cluster", + {"easy-multimercluster", easymultimercluster, &localPar.easymultimerclusterworkflow, COMMAND_EASY, + "Multimer level cluster", "#Clustering of PDB files\n" - "foldseek easy-complexcluster examples/ result tmp\n" + "foldseek easy-multimercluster examples/ result tmp\n" "# Cluster output\n" "# - result_rep_seq.fasta: Representatives\n" "# - result_cluster.tsv: Adjacency list\n\n" @@ -329,11 +328,6 @@ std::vector foldseekCommands = { {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory } } }, -======= - {"scorecomplex", scoremultimer, &localPar.scoremultimer, COMMAND_HIDDEN, - "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} - }, ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 {"multimersearch", multimersearch, &localPar.multimersearchworkflow, COMMAND_MAIN, "Complex level search", "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" @@ -353,14 +347,7 @@ std::vector foldseekCommands = { {"tempDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory} } }, -<<<<<<< HEAD {"easy-multimersearch", easymultimersearch, &localPar.easysmultimersearchworkflow, COMMAND_EASY, -======= - {"complexsearch", multimersearch, &localPar.multimersearchworkflow, COMMAND_HIDDEN, - "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} - }, - {"easy-multimersearch", easymultimersearch, &localPar.easymultimersearchworkflow, COMMAND_EASY, ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 "Complex level search", "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp\n" @@ -379,14 +366,7 @@ std::vector foldseekCommands = { {"tempDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory} } }, -<<<<<<< HEAD {"createmultimerreport", createcomplexreport, &localPar.createcomplexreport, COMMAND_FORMAT_CONVERSION, -======= - {"easy-complexsearch", easymultimersearch, &localPar.easymultimersearchworkflow, COMMAND_EASY, - "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} - }, - {"createmultimerreport", createmultimerreport, &localPar.createmultimerreport, COMMAND_FORMAT_CONVERSION, ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 "Convert complexDB to tsv format", "# Create output in tsv format (9 columns): qComplexName.c_str(), tComplexName.c_str(), qChainString.c_str(), tChainString.c_str(), qTMScore, tTMScore, u, t, assId\n" "# (1,2) identifiers for query and target complex,\n" @@ -404,7 +384,6 @@ std::vector foldseekCommands = { {"complexFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile} } }, -<<<<<<< HEAD {"expandmultimer", expandmultimer, &localPar.expandmultimer, COMMAND_PREFILTER, "Re-prefilter to ensure complete alignment between complexes", NULL, @@ -416,25 +395,6 @@ std::vector foldseekCommands = { {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::prefilterDb } } -======= - {"createcomplexreport", createmultimerreport, &localPar.createmultimerreport, COMMAND_FORMAT_CONVERSION, - "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} - }, - {"expandmultimer", expandmultimer, &localPar.expandmultimer, COMMAND_PREFILTER, - "Re-prefilter to ensure complete alignment between complexes", - NULL, - "Woosub Kim ", - " ", - CITATION_FOLDSEEK_MULTIMER, { - {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, - {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, - {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, - {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::prefilterDb } - } - }, - {"expandcomplex", expandmultimer, &localPar.expandmultimer, COMMAND_PREFILTER, - "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 }, {"version", versionstring, &localPar.empty, COMMAND_HIDDEN, "", diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index f0463f1a..6523b69b 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -21,12 +21,9 @@ extern int structureungappedalign(int argc, const char** argv, const Command &co extern int convert2pdb(int argc, const char** argv, const Command &command); extern int compressca(int argc, const char** argv, const Command &command); extern int scoremultimer(int argc, const char **argv, const Command& command); -<<<<<<< HEAD extern int filtermultimer(int argc, const char **argv, const Command& command); extern int easymultimerluster(int argc, const char** argv, const Command &command); extern int multimerluster(int argc, const char** argv, const Command &command); -======= ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 extern int easymultimersearch(int argc, const char **argv, const Command &command); extern int createmultimerreport(int argc, const char **argv, const Command &command); extern int expandmultimer(int argc, const char **argv, const Command &command); diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index b9bf89be..1ae01f1c 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -27,24 +27,18 @@ LocalParameters::LocalParameters() : PARAM_FILE_EXCLUDE(PARAM_FILE_EXCLUDE_ID, "--file-exclude", "File Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &fileExclude, "^.*$"), PARAM_INDEX_EXCLUDE(PARAM_INDEX_EXCLUDE_ID, "--index-exclude", "Index Exclusion", "Exclude parts of the index:\n0: Full index\n1: Exclude k-mer index (for use with --prefilter-mode 1)\n2: Exclude C-alpha coordinates (for use with --sort-by-structure-bits 0)\nFlags can be combined bit wise", typeid(int), (void *) &indexExclude, "^[0-3]{1}$", MMseqsParameter::COMMAND_EXPERT), PARAM_MULTIMER_REPORT_MODE(PARAM_MULTIMER_REPORT_MODE_ID, "--multimer-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &multimerReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), -<<<<<<< HEAD - PARAM_MULTIMER_REPORT_MODE(PARAM_MULTIMER_REPORT_MODE_ID, "--multimer-report-mode", "Complex report mode", "Complex report mode:\n0: No report\n1: Write complex report", typeid(int), (void *) &multimerReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), - PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), - PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), - PARAM_MULTIMER_TM_THRESHOLD(PARAM_MULTIMER_TM_THRESHOLD_ID,"--multimer-tm-threshold", "TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtMultimerTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "chain TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_FILTER_MODE(PARAM_FILTER_MODE_ID, "--filter-mode", "Filter mode", "0: Interface\n1: Conformation\n2: loose", typeid(int), (void *) &filterMode, "[0-2]{0}$", MMseqsParameter::COMMAND_CLUST) - -======= PARAM_MULTIMER_REPORT_MODE_BC_COMPAT(PARAM_MULTIMER_REPORT_MODE_BC_COMPAT_ID, "--complex-report-mode", "", "", typeid(int), (void *) &multimerReportMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_HIDDEN), PARAM_EXPAND_MULTIMER_EVALUE(PARAM_EXPAND_MULTIMER_EVALUE_ID, "--expand-multimer-evalue", "Multimer E-value", "E-value threshold for multimer chain expansion (range 0.0-inf)", typeid(double), (void *) &eValueThrExpandMultimer, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN), PARAM_EXPAND_MULTIMER_EVALUE_BC_COMPAT(PARAM_EXPAND_MULTIMER_EVALUE_BC_COMPAT_ID, "--expand-complex-evalue", "", "", typeid(double), (void *) &eValueThrExpandMultimer, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_HIDDEN), PARAM_INPUT_FORMAT(PARAM_INPUT_FORMAT_ID, "--input-format", "Input format", "Format of input structures:\n0: Auto-detect by extension\n1: PDB\n2: mmCIF\n3: mmJSON\n4: ChemComp\n5: Foldcomp", typeid(int), (void *) &inputFormat, "^[0-5]{1}$"), PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_PROSTT5_MODEL(PARAM_PROSTT5_MODEL_ID, "--prostt5-model", "Path to ProstT5", "Path to ProstT5 model", typeid(std::string), (void *) &prostt5Model, "^.*$", MMseqsParameter::COMMAND_COMMON), - PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON) ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 -{ + PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON), + PARAM_MULTIMER_TM_THRESHOLD(PARAM_MULTIMER_TM_THRESHOLD_ID,"--multimer-tm-threshold", "TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtMultimerTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "chain TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_FILTER_MODE(PARAM_FILTER_MODE_ID, "--filter-mode", "Filter mode", "0: Interface\n1: Conformation\n2: loose", typeid(int), (void *) &filterMode, "[0-2]{0}$", MMseqsParameter::COMMAND_CLUST) + + { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$"; PARAM_ALIGNMENT_MODE.category = MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT; @@ -191,10 +185,9 @@ LocalParameters::LocalParameters() : compressca.push_back(&PARAM_V); //scorecmultimer -<<<<<<< HEAD + scoremultimer.push_back(&PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD); scoremultimer.push_back(&PARAM_THREADS); scoremultimer.push_back(&PARAM_V); - scoremultimer.push_back(&PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD); //filtermultimer filtermultimer.push_back(&PARAM_V); @@ -215,11 +208,12 @@ LocalParameters::LocalParameters() : multimersearchworkflow.push_back(&PARAM_EXPAND_COMPLEX_EVALUE); // easysmultimersearchworkflow - easysmultimersearchworkflow = combineList(structurecreatedb, multimersearchworkflow); - easysmultimersearchworkflow = combineList(easysmultimersearchworkflow, convertalignments); - easysmultimersearchworkflow = combineList(easysmultimersearchworkflow, createmultimerreport); - easysmultimersearchworkflow = combineList(easysmultimersearchworkflow, createsubdb); - easysmultimersearchworkflow.push_back(&PARAM_COMPLEX_REPORT_MODE); + multimersearchworkflow = combineList(structuresearchworkflow, scoremultimer); + multimersearchworkflow = combineList(multimersearchworkflow, expandmultimer); + multimersearchworkflow.push_back(&PARAM_EXPAND_MULTIMER_EVALUE); + multimersearchworkflow.push_back(&PARAM_EXPAND_MULTIMER_EVALUE_BC_COMPAT); + multimersearchworkflow.push_back(&PARAM_MULTIMER_REPORT_MODE); + multimersearchworkflow.push_back(&PARAM_MULTIMER_REPORT_MODE_BC_COMPAT); // multimerclusterworkflow multimerclusterworkflow = combineList(multimersearchworkflow, filtermultimer); @@ -229,37 +223,9 @@ LocalParameters::LocalParameters() : easymultimerlusterworkflow = combineList(structurecreatedb, multimerclusterworkflow); easymultimerlusterworkflow = combineList(easymultimerlusterworkflow, result2repseq); - // expandcomplex - expandcomplex.push_back(&PARAM_THREADS); - expandcomplex.push_back(&PARAM_V); -======= - scoremultimer.push_back(&PARAM_MIN_ASSIGNED_CHAINS_THRESHOLD); - scoremultimer.push_back(&PARAM_THREADS); - scoremultimer.push_back(&PARAM_V); - - // createmultimerreport - createmultimerreport.push_back(&PARAM_DB_OUTPUT); - createmultimerreport.push_back(&PARAM_THREADS); - createmultimerreport.push_back(&PARAM_V); - - // multimersearchworkflow - multimersearchworkflow = combineList(structuresearchworkflow, scoremultimer); - multimersearchworkflow = combineList(multimersearchworkflow, expandmultimer); - multimersearchworkflow.push_back(&PARAM_EXPAND_MULTIMER_EVALUE); - multimersearchworkflow.push_back(&PARAM_EXPAND_MULTIMER_EVALUE_BC_COMPAT); - multimersearchworkflow.push_back(&PARAM_MULTIMER_REPORT_MODE); - multimersearchworkflow.push_back(&PARAM_MULTIMER_REPORT_MODE_BC_COMPAT); - - // easymultimersearchworkflow - easymultimersearchworkflow = combineList(structurecreatedb, multimersearchworkflow); - easymultimersearchworkflow = combineList(easymultimersearchworkflow, convertalignments); - easymultimersearchworkflow = combineList(easymultimersearchworkflow, createmultimerreport); - easymultimersearchworkflow = removeParameter(easymultimersearchworkflow, PARAM_PROSTT5_MODEL); - // expandmultimer expandmultimer.push_back(&PARAM_THREADS); expandmultimer.push_back(&PARAM_V); ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 // convert2pdb convert2pdb.push_back(&PARAM_PDB_OUTPUT_MODE); @@ -292,20 +258,15 @@ LocalParameters::LocalParameters() : indexExclude = 0; multimerReportMode = 1; eValueThrExpandMultimer = 10000.0; -<<<<<<< HEAD citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); + citations.emplace(CITATION_FOLDSEEK_MULTIMER, "Kim, W., Mirdita, M., Levy Karin, E., Gilchrist, C.L.M., Schweke, H., Söding, J., Levy, E., and Steinegger, M. Rapid and Sensitive Protein Complex Alignment with Foldseek-Multimer. bioRxiv, doi:10.1101/2024.04.14.589414 (2024)"); + citations.emplace(CITATION_PROSTT5, "Heinzinger, M., Weissenow, K., Gomez Sanchez, J., Henkel, A., Mirdita, M., Steinegger, M., Steinegger, M., and Burkhard, R. Bilingual Language Model for Protein Sequence and Structure. bioRxiv, doi:10.1101/2023.07.23.550085 (2024)"); filtMultimerTmThr = 0.0; filtChainTmThr = 0.0; filterMode = 0; -======= prostt5Model = ""; gpu = 0; - citations.emplace(CITATION_FOLDSEEK, "van Kempen, M., Kim, S.S., Tumescheit, C., Mirdita, M., Lee, J., Gilchrist, C.L.M., Söding, J., and Steinegger, M. Fast and accurate protein structure search with Foldseek. Nature Biotechnology, doi:10.1038/s41587-023-01773-0 (2023)"); - citations.emplace(CITATION_FOLDSEEK_MULTIMER, "Kim, W., Mirdita, M., Levy Karin, E., Gilchrist, C.L.M., Schweke, H., Söding, J., Levy, E., and Steinegger, M. Rapid and Sensitive Protein Complex Alignment with Foldseek-Multimer. bioRxiv, doi:10.1101/2024.04.14.589414 (2024)"); - citations.emplace(CITATION_PROSTT5, "Heinzinger, M., Weissenow, K., Gomez Sanchez, J., Henkel, A., Mirdita, M., Steinegger, M., Steinegger, M., and Burkhard, R. Bilingual Language Model for Protein Sequence and Structure. bioRxiv, doi:10.1101/2023.07.23.550085 (2024)"); ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 - //rewrite param vals. PARAM_FORMAT_OUTPUT.description = "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,\nlddt,lddtfull,qca,tca,t,u,qtmscore,ttmscore,alntmscore,rmsd,prob\ncomplexqtmscore,complexttmscore,complexu,complext,complexassignid\n"; } @@ -371,11 +332,7 @@ std::vector LocalParameters::getOutputFormat(int formatMode, const std::str else if (outformatSplit[i].compare("lddtfull") == 0) { needQCa = true; needTCa = true; needLDDT = true; needBacktrace = true; code = LocalParameters::OUTFMT_LDDT_FULL; } else if (outformatSplit[i].compare("prob") == 0) { needQCa = true; needTCa = true; needLDDT = true; needBacktrace = true; needTMaligner = true; code = LocalParameters::OUTFMT_PROBTP; } // TODO -<<<<<<< HEAD - else if (outformatSplit[i].compare("complexqtmscore")==0 || outformatSplit[i].compare("multimerqtmscore")==0) {code=LocalParameters::OUTFMT_Q_COMPLEX_TMSCORE; } -======= else if (outformatSplit[i].compare("complexqtmscore")==0 || outformatSplit[i].compare("multimerqtmscore")==0){code=LocalParameters::OUTFMT_Q_COMPLEX_TMSCORE; } ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 else if (outformatSplit[i].compare("complexttmscore")==0 || outformatSplit[i].compare("multimerttmscore")==0){code=LocalParameters::OUTFMT_T_COMPLEX_TMSCORE;} else if (outformatSplit[i].compare("complexassignid")==0 || outformatSplit[i].compare("multimerassignid")==0){code=LocalParameters::OUTFMT_ASSIGN_ID;} else if (outformatSplit[i].compare("complexu")==0 || outformatSplit[i].compare("multimeru")==0){code=LocalParameters::OUTFMT_COMPLEX_U;} diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 33441905..9319bcaa 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -101,16 +101,11 @@ class LocalParameters : public Parameters { std::vector structurecreatedb; std::vector compressca; std::vector scoremultimer; -<<<<<<< HEAD std::vector filtermultimer; std::vector multimerclusterworkflow; std::vector easymultimerlusterworkflow; - std::vector multimersearchworkflow; - std::vector easysmultimersearchworkflow; -======= std::vector multimersearchworkflow; std::vector easymultimersearchworkflow; ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 std::vector createmultimerreport; std::vector expandmultimer; std::vector convert2pdb; @@ -134,14 +129,6 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_FILE_EXCLUDE) PARAMETER(PARAM_INDEX_EXCLUDE) PARAMETER(PARAM_MULTIMER_REPORT_MODE) -<<<<<<< HEAD - PARAMETER(PARAM_EXPAND_MULTIMER_EVALUE) - PARAMETER(PARAM_INPUT_FORMAT) - PARAMETER(PARAM_PDB_OUTPUT_MODE) - PARAMETER(PARAM_MULTIMER_TM_THRESHOLD) - PARAMETER(PARAM_CHAIN_TM_THRESHOLD) - PARAMETER(PARAM_FILTER_MODE) -======= PARAMETER(PARAM_MULTIMER_REPORT_MODE_BC_COMPAT) PARAMETER(PARAM_EXPAND_MULTIMER_EVALUE) PARAMETER(PARAM_EXPAND_MULTIMER_EVALUE_BC_COMPAT) @@ -149,7 +136,12 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_PDB_OUTPUT_MODE) PARAMETER(PARAM_PROSTT5_MODEL) PARAMETER(PARAM_GPU) ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 + PARAMETER(PARAM_MULTIMER_TM_THRESHOLD) + PARAMETER(PARAM_CHAIN_TM_THRESHOLD) + PARAMETER(PARAM_FILTER_MODE) + + + int prefMode; float tmScoreThr; @@ -173,14 +165,11 @@ class LocalParameters : public Parameters { double eValueThrExpandMultimer; int inputFormat; int pdbOutputMode; -<<<<<<< HEAD float filtMultimerTmThr; float filtChainTmThr; int filterMode; -======= std::string prostt5Model; int gpu; ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner, diff --git a/src/strucclustutils/CMakeLists.txt b/src/strucclustutils/CMakeLists.txt index d78794cf..f9068165 100644 --- a/src/strucclustutils/CMakeLists.txt +++ b/src/strucclustutils/CMakeLists.txt @@ -14,14 +14,9 @@ set(strucclustutils_source_files strucclustutils/convert2pdb.cpp strucclustutils/compressca.cpp strucclustutils/scoremultimer.cpp -<<<<<<< HEAD strucclustutils/filtermultimer.cpp - strucclustutils/createmultimereport.cpp - strucclustutils/createcmultimerreport.h -======= strucclustutils/createmultimerreport.cpp strucclustutils/MultimerUtil.h ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 strucclustutils/expandmultimer.cpp PARENT_SCOPE ) diff --git a/src/strucclustutils/structcreatedb.cpp b/src/strucclustutils/structcreatedb.cpp index fbc44d0a..cf224212 100644 --- a/src/strucclustutils/structcreatedb.cpp +++ b/src/strucclustutils/structcreatedb.cpp @@ -161,16 +161,12 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru torsiondbw.writeData(alphabet3di.data(), alphabet3di.size(), dbKey, thread_idx); aadbw.writeData(alphabetAA.data(), alphabetAA.size(), dbKey, thread_idx); header.clear(); -<<<<<<< HEAD - header.append(Util::remove_extension(readStructure.names[ch])); -======= if (Util::endsWith(".gz", readStructure.names[ch])){ header.append(Util::remove_extension(Util::remove_extension(readStructure.names[ch]))); } else{ header.append(Util::remove_extension(readStructure.names[ch])); } ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 if(readStructure.modelCount > 1){ header.append("_MODEL_"); header.append(std::to_string(readStructure.modelIndices[ch])); @@ -188,28 +184,19 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru std::string entryName = Util::parseFastaHeader(header.c_str()); #pragma omp critical { -<<<<<<< HEAD - std::map::iterator it = filenameToFileId.find(Util::remove_extension(filename)); -======= std::string filenameWithExtension = filename; if (Util::endsWith(".gz", filename)){ filenameWithExtension = Util::remove_extension(filename); } std::string filenameWithoutExtension = Util::remove_extension(filenameWithExtension); std::map::iterator it = filenameToFileId.find(filenameWithoutExtension); ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 size_t fileid; if (it != filenameToFileId.end()) { fileid = it->second; } else { fileid = fileidCnt; -<<<<<<< HEAD - filenameToFileId[Util::remove_extension(filename)] = fileid; - fileIdToName[fileid] = Util::remove_extension(filename); -======= filenameToFileId[filenameWithoutExtension] = fileid; fileIdToName[fileid] = filenameWithoutExtension; ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 fileidCnt++; } entrynameToFileId[entryName] = std::make_pair(fileid, readStructure.modelIndices[ch]); diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt index a9de71ed..88cc3daa 100644 --- a/src/workflow/CMakeLists.txt +++ b/src/workflow/CMakeLists.txt @@ -7,12 +7,8 @@ set(workflow_source_files workflow/EasyStructureSearch.cpp workflow/EasyStructureCluster.cpp workflow/EasyMultimerSearch.cpp -<<<<<<< HEAD - workflow/Multimerearch.cpp + workflow/MultimerSearch.cpp workflow/MultimerCluster.cpp workflow/EasyMultimerCluster.cpp -======= - workflow/MultimerSearch.cpp ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851 PARENT_SCOPE ) diff --git a/src/workflow/EasyMultimerSearch.cpp b/src/workflow/EasyMultimerSearch.cpp index 3239463c..a66b07f3 100644 --- a/src/workflow/EasyMultimerSearch.cpp +++ b/src/workflow/EasyMultimerSearch.cpp @@ -7,11 +7,6 @@ #include "Debug.h" #include "easymultimersearch.sh.h" -<<<<<<< HEAD:src/workflow/EasyComplexSearch.cpp -#include "easymultimersearch.sh.h" - -======= ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851:src/workflow/EasyMultimerSearch.cpp int easymultimersearch(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); diff --git a/src/workflow/MultimerSearch.cpp b/src/workflow/MultimerSearch.cpp index 3acf09a4..34ffd0c3 100644 --- a/src/workflow/MultimerSearch.cpp +++ b/src/workflow/MultimerSearch.cpp @@ -8,10 +8,6 @@ #include "multimersearch.sh.h" -<<<<<<< HEAD:src/workflow/ComplexSearch.cpp - -======= ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851:src/workflow/MultimerSearch.cpp int multimersearch(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); @@ -110,12 +106,8 @@ int multimersearch(int argc, const char **argv, const Command &command) { cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); -<<<<<<< HEAD:src/workflow/ComplexSearch.cpp - std::string program = tmpDir + "/multimersearch.sh"; -======= // cmd.addVariable("EXP_MULTIMER_PAR", ("-e " + std::to_string(par.eValueThrExpandMultimer)).c_str()); std::string program = tmpDir + "/multimersearch.sh"; ->>>>>>> 25812ffa585248b146fb0217b981b507dc92e851:src/workflow/MultimerSearch.cpp FileUtil::writeFile(program, multimersearch_sh, multimersearch_sh_len); cmd.execProgram(program.c_str(), par.filenames); // Should never get here From a82587c6311cb00664d909f2978073bbc8cddbdb Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 14 Jul 2024 22:22:19 +0900 Subject: [PATCH 103/160] minor --- data/easymultimercluster.sh | 6 +-- data/multimercluster.sh | 2 +- src/workflow/EasyMultimerCluster.cpp | 77 ++++++++++++++++++++++++++++ src/workflow/MultimerCluster.cpp | 73 ++++++++++++++++++++++++++ 4 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 src/workflow/EasyMultimerCluster.cpp create mode 100644 src/workflow/MultimerCluster.cpp diff --git a/data/easymultimercluster.sh b/data/easymultimercluster.sh index 00e95db1..35d11fcf 100644 --- a/data/easymultimercluster.sh +++ b/data/easymultimercluster.sh @@ -88,8 +88,8 @@ fi if notExists "${TMP_PATH}/complex_clu.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" complexcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clu" "${TMP_PATH}" ${COMPLEXCLUSTER_PAR} \ - || fail "Complexcluster died" + "$MMSEQS" multimerxcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \ + || fail "Multimercluster died" fi SOURCE="${TMP_PATH}/input" @@ -155,5 +155,5 @@ if [ -n "${REMOVE_TMP}" ]; then "$MMSEQS" rmdb "${TMP_PATH}/input_ss" ${VERBOSITY_PAR} rm "${TMP_PATH}/rep_seqs.list" rm -rf "${TMP_PATH}/latest" - rm -f "${TMP_PATH}/easycomplexcluster.sh" + rm -f "${TMP_PATH}/easymultimercluster.sh" fi \ No newline at end of file diff --git a/data/multimercluster.sh b/data/multimercluster.sh index d36251b6..0376d8cc 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -124,5 +124,5 @@ if [ -n "${REMOVE_TMP}" ]; then rm "${TMP_PATH}/complex_header.tsv" rm "${TMP_PATH}/complex_header.tsv_redundant" rm -rf "${TMP_PATH}/complexsearch_tmp" - rm -f "${TMP_PATH}/complexcluster.sh" + rm -f "${TMP_PATH}/multimercluster.sh" fi \ No newline at end of file diff --git a/src/workflow/EasyMultimerCluster.cpp b/src/workflow/EasyMultimerCluster.cpp new file mode 100644 index 00000000..88d081d2 --- /dev/null +++ b/src/workflow/EasyMultimerCluster.cpp @@ -0,0 +1,77 @@ +#include + +#include "LocalParameters.h" +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" + +#include "easymultimercluster.sh.h" + +void setEasyMultimerClusterDefaults(Parameters *p) { + //TODO + p->removeTmpFiles = true; + p->writeLookup = true; +} + +void setEasyMultimerClusterMustPassAlong(Parameters *p) { + //TODO + p->PARAM_REMOVE_TMP_FILES.wasSet = true; + p->PARAM_WRITE_LOOKUP.wasSet = true; +} + +int easymultimercluster(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + //TODO + par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); //prefilter + par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); + + for (size_t i = 0; i < par.createdb.size(); i++){ + par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } + par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + + setEasyMultimerMultimerClusterDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setEasyMultimerClusterMustPassAlong(&par); + + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + CommandCaller cmd; + cmd.addVariable("TMP_PATH", tmpDir.c_str()); + cmd.addVariable("RESULT", par.filenames.back().c_str()); + par.filenames.pop_back(); + cmd.addVariable("INPUT", par.filenames.back().c_str()); + par.filenames.pop_back(); + + cmd.addVariable("RUNNER", par.runner.c_str()); + cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); + cmd.addVariable("MULTIMERCLUSTER_PAR", par.createParameterString(par.multimerclusterworkflow,true).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); + cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str()); + cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); + cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); + cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); + + std::string program = tmpDir + "/easymultimercluster.sh"; + FileUtil::writeFile(program, easymultimercluster_sh, easymultimercluster_sh_len); + + cmd.execProgram(program.c_str(), par.filenames); + + // Should never get here + assert(false); + return EXIT_FAILURE; +} diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp new file mode 100644 index 00000000..69315cb1 --- /dev/null +++ b/src/workflow/MultimerCluster.cpp @@ -0,0 +1,73 @@ +#include + +#include "FileUtil.h" +#include "CommandCaller.h" +#include "Util.h" +#include "Debug.h" +#include "LocalParameters.h" + +#include "multimercluster.sh.h" + +void setMultimerClusterDefaults(LocalParameters *p) { + p->covThr = 0.8; + p->filtMultimerTmThr = 0.5; // FIX + // p->filtChainTmThr=0.0; // FIX + p->filterMode=0; + p->covMode = 1; + p->clusteringMode = Parameters::GREEDY; + p->removeTmpFiles = true; +} + +void setMultimerClusterMustPassAlong(Parameters *p) { + p->PARAM_C.wasSet = true; + p->PARAM_REMOVE_TMP_FILES.wasSet = true; +} +int multimercluster(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); //prefilter + par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); //align + for (size_t i = 0; i < par.createdb.size(); i++){ + par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + } + par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); + par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); + + setMultimerClusterDefaults(&par); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + setMultimerClusterMustPassAlong(&par); + + std::string tmpDir = par.filenames.back(); + std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); + if (par.reuseLatest) { + hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); + } + tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); + par.filenames.pop_back(); + + CommandCaller cmd; + std::cout< Date: Mon, 15 Jul 2024 01:25:54 +0900 Subject: [PATCH 104/160] minor --- data/CMakeLists.txt | 2 +- data/easymultimercluster.sh | 2 +- data/multimercluster.sh | 8 +-- src/FoldseekBase.cpp | 37 +++++++++---- src/LocalCommandDeclarations.h | 4 +- src/commons/LocalParameters.cpp | 17 ++++-- src/commons/LocalParameters.h | 2 +- src/strucclustutils/filtermultimer.cpp | 45 +++++++-------- src/workflow/ComplexCluster.cpp | 73 ------------------------ src/workflow/EasyComplexCluster.cpp | 77 -------------------------- src/workflow/EasyMultimerCluster.cpp | 2 +- src/workflow/MultimerCluster.cpp | 4 +- 12 files changed, 70 insertions(+), 203 deletions(-) delete mode 100644 src/workflow/ComplexCluster.cpp delete mode 100644 src/workflow/EasyComplexCluster.cpp diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index 866fc66d..1c7b0622 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -15,7 +15,7 @@ set(COMPILED_RESOURCES vendor.js.zst multimersearch.sh easymultimersearch.sh - multimerluster.sh + multimercluster.sh easymultimercluster.sh ) diff --git a/data/easymultimercluster.sh b/data/easymultimercluster.sh index 35d11fcf..227519f6 100644 --- a/data/easymultimercluster.sh +++ b/data/easymultimercluster.sh @@ -88,7 +88,7 @@ fi if notExists "${TMP_PATH}/complex_clu.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" multimerxcluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \ + "$MMSEQS" multimercluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \ || fail "Multimercluster died" fi diff --git a/data/multimercluster.sh b/data/multimercluster.sh index 0376d8cc..56ef5bb0 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -78,14 +78,14 @@ buldCmplhDb(){ if notExists "${TMP_PATH}/complex_result.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" multimersearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/multimersearch_tmp" ${COMPLEXSEARCH_PAR} \ + "$MMSEQS" multimersearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/multimersearch_tmp" ${MULTIMERSEARCH_PAR} \ || fail "multimerSearch died" fi if notExists "complex_filt.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" filtercomplex "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" "${TMP_PATH}/filtcov.tsv" ${FILTERCOMPLEX_PAR} \ - || fail "FilterComplex died" + "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" "${TMP_PATH}/filtcov.tsv" ${FILTERMULTIMER_PAR} \ + || fail "FilterMultimer died" fi # shift query DB, .index, .dbtype @@ -123,6 +123,6 @@ if [ -n "${REMOVE_TMP}" ]; then "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} rm "${TMP_PATH}/complex_header.tsv" rm "${TMP_PATH}/complex_header.tsv_redundant" - rm -rf "${TMP_PATH}/complexsearch_tmp" + rm -rf "${TMP_PATH}/multimersearch_tmp" rm -f "${TMP_PATH}/multimercluster.sh" fi \ No newline at end of file diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 9837ed8c..719264ca 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -258,8 +258,8 @@ std::vector foldseekCommands = { CITATION_FOLDSEEK, {{"Db", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb }, {"pdbFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"scoremultimer", scoremultimer, &localPar.scoremultimer, COMMAND_ALIGNMENT, - "Get complex level alignments from alignmentDB", - "# Get complex level alignments (chain assignments and tm-scores) from alignmentDB.\n" + "Get Multimer level alignments from alignmentDB", + "# Get multimer level alignments (chain assignments and tm-scores) from alignmentDB.\n" "foldseek scoremultimer queryDB targetDB alignmentDB complexDB\n" "# simple tsv output format" "foldseek createmultimerreport queryDB targetDB complexDB result.tsv" @@ -274,6 +274,9 @@ std::vector foldseekCommands = { {"complexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb} } }, + {"scorecomplex", scoremultimer, &localPar.scoremultimer, COMMAND_HIDDEN, + "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} + }, {"filtermultimer", filtermultimer, &localPar.filtermultimer, COMMAND_HIDDEN, "Filters multimers satisfying given coverage", "foldseek filtermultimer queryDB targetDB alignmentDB complexDB -c 0.8 --cov-mode 1\n", @@ -329,8 +332,8 @@ std::vector foldseekCommands = { } }, {"multimersearch", multimersearch, &localPar.multimersearchworkflow, COMMAND_MAIN, - "Complex level search", - "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" + "Multimer level search", + "# Search a single/multiple PDB file against a set of PDB files and get multimer level alignments\n" "foldseek multimersearch queryDB targetDB result tmp\n" "# Format output differently\n" "foldseek multimersearch queryDB targetDB result tmp --format-output query,target,qstart,tstart,cigar\n" @@ -347,9 +350,12 @@ std::vector foldseekCommands = { {"tempDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory} } }, - {"easy-multimersearch", easymultimersearch, &localPar.easysmultimersearchworkflow, COMMAND_EASY, - "Complex level search", - "# Search a single/multiple PDB file against a set of PDB files and get complex level alignments\n" + {"complexsearch", multimersearch, &localPar.multimersearchworkflow, COMMAND_HIDDEN, + "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} + }, + {"easy-multimersearch", easymultimersearch, &localPar.easymultimersearchworkflow, COMMAND_EASY, + "Multimer level search", + "# Search a single/multiple PDB file against a set of PDB files and get multimer level alignments\n" "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp\n" "# Format output differently\n" "foldseek easy-multimersearch example/1tim.pdb.gz example/8tim.pdb.gz result tmp --format-output query,target,qstart,tstart,cigar\n" @@ -366,11 +372,14 @@ std::vector foldseekCommands = { {"tempDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory} } }, - {"createmultimerreport", createcomplexreport, &localPar.createcomplexreport, COMMAND_FORMAT_CONVERSION, + {"easy-complexsearch", easymultimersearch, &localPar.easymultimersearchworkflow, COMMAND_EASY, + "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} + }, + {"createmultimerreport", createmultimerreport, &localPar.createmultimerreport, COMMAND_FORMAT_CONVERSION, "Convert complexDB to tsv format", "# Create output in tsv format (9 columns): qComplexName.c_str(), tComplexName.c_str(), qChainString.c_str(), tChainString.c_str(), qTMScore, tTMScore, u, t, assId\n" - "# (1,2) identifiers for query and target complex,\n" - "# (3,4) chains of query complex and target complex,\n" + "# (1,2) identifiers for query and target multimer,\n" + "# (3,4) chains of query multimer and target multimer,\n" "# (5,6) tm score based on query and target residue length,\n" "# (8,9) u and t,\n" "# (9) assignment id\n" @@ -384,8 +393,11 @@ std::vector foldseekCommands = { {"complexFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile} } }, + {"createcomplexreport", createmultimerreport, &localPar.createmultimerreport, COMMAND_FORMAT_CONVERSION, + "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} + }, {"expandmultimer", expandmultimer, &localPar.expandmultimer, COMMAND_PREFILTER, - "Re-prefilter to ensure complete alignment between complexes", + "Re-prefilter to ensure complete alignment between multimers", NULL, "Woosub Kim ", " ", @@ -396,6 +408,9 @@ std::vector foldseekCommands = { {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::prefilterDb } } }, + {"expandcomplex", expandmultimer, &localPar.expandmultimer, COMMAND_PREFILTER, + "", NULL, "", "", CITATION_FOLDSEEK_MULTIMER, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}} + }, {"version", versionstring, &localPar.empty, COMMAND_HIDDEN, "", NULL, diff --git a/src/LocalCommandDeclarations.h b/src/LocalCommandDeclarations.h index 6523b69b..c362cc12 100644 --- a/src/LocalCommandDeclarations.h +++ b/src/LocalCommandDeclarations.h @@ -22,8 +22,8 @@ extern int convert2pdb(int argc, const char** argv, const Command &command); extern int compressca(int argc, const char** argv, const Command &command); extern int scoremultimer(int argc, const char **argv, const Command& command); extern int filtermultimer(int argc, const char **argv, const Command& command); -extern int easymultimerluster(int argc, const char** argv, const Command &command); -extern int multimerluster(int argc, const char** argv, const Command &command); +extern int easymultimercluster(int argc, const char** argv, const Command &command); +extern int multimercluster(int argc, const char** argv, const Command &command); extern int easymultimersearch(int argc, const char **argv, const Command &command); extern int createmultimerreport(int argc, const char **argv, const Command &command); extern int expandmultimer(int argc, const char **argv, const Command &command); diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 1ae01f1c..5b19d89c 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -194,7 +194,7 @@ LocalParameters::LocalParameters() : filtermultimer.push_back(&PARAM_THREADS); filtermultimer.push_back(&PARAM_C); filtermultimer.push_back(&PARAM_COV_MODE); - filtermultimer.push_back(&PARAM_COMPLEX_TM_THRESHOLD); + filtermultimer.push_back(&PARAM_MULTIMER_TM_THRESHOLD); filtermultimer.push_back(&PARAM_CHAIN_TM_THRESHOLD); filtermultimer.push_back(&PARAM_FILTER_MODE); @@ -205,7 +205,7 @@ LocalParameters::LocalParameters() : // multimersearchworkflow multimersearchworkflow = combineList(structuresearchworkflow, scoremultimer); - multimersearchworkflow.push_back(&PARAM_EXPAND_COMPLEX_EVALUE); + multimersearchworkflow.push_back(&PARAM_EXPAND_MULTIMER_EVALUE); // easysmultimersearchworkflow multimersearchworkflow = combineList(structuresearchworkflow, scoremultimer); @@ -215,13 +215,20 @@ LocalParameters::LocalParameters() : multimersearchworkflow.push_back(&PARAM_MULTIMER_REPORT_MODE); multimersearchworkflow.push_back(&PARAM_MULTIMER_REPORT_MODE_BC_COMPAT); + // easymultimersearchworkflow + easymultimersearchworkflow = combineList(structurecreatedb, multimersearchworkflow); + easymultimersearchworkflow = combineList(easymultimersearchworkflow, convertalignments); + easymultimersearchworkflow = combineList(easymultimersearchworkflow, createmultimerreport); + easymultimersearchworkflow = removeParameter(easymultimersearchworkflow, PARAM_PROSTT5_MODEL); + + // multimerclusterworkflow multimerclusterworkflow = combineList(multimersearchworkflow, filtermultimer); multimerclusterworkflow = combineList(multimerclusterworkflow, clust); - //easymultimerlusterworkflow - easymultimerlusterworkflow = combineList(structurecreatedb, multimerclusterworkflow); - easymultimerlusterworkflow = combineList(easymultimerlusterworkflow, result2repseq); + //easymultimerclusterworkflow + easymultimerclusterworkflow = combineList(structurecreatedb, multimerclusterworkflow); + easymultimerclusterworkflow = combineList(easymultimerclusterworkflow, result2repseq); // expandmultimer expandmultimer.push_back(&PARAM_THREADS); diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 9319bcaa..cbee8feb 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -103,7 +103,7 @@ class LocalParameters : public Parameters { std::vector scoremultimer; std::vector filtermultimer; std::vector multimerclusterworkflow; - std::vector easymultimerlusterworkflow; + std::vector easymultimerclusterworkflow; std::vector multimersearchworkflow; std::vector easymultimersearchworkflow; std::vector createmultimerreport; diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index a1200209..67763680 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -9,17 +9,15 @@ #include "MemoryMapped.h" #include "Coordinate16.h" #include "tmalign/basic_fun.h" -#include "createcomplexreport.h" +#include "MultimerUtil.h" #include "LDDT.h" #include "CalcProbTP.h" #include - #ifdef OPENMP #include #endif - unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: @@ -38,22 +36,21 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { class ComplexFilterCriteria { public: unsigned int dbKey; - unsigned int qTotalAlnLen; - unsigned int tTotalAlnLen; - float qCov; - float tCov; double qTM; double tTM; - std::vector alignedQChainTmScores; - std::vector alignedTChainTmScores; std::vector qChainKeys; std::vector tChainKeys; float t[3]; float u[3][3]; + unsigned int qTotalAlnLen; + unsigned int tTotalAlnLen; + float qCov; + float tCov; + std::vector alignedQChainTmScores; + std::vector alignedTChainTmScores; - ComplexFilterCriteria() {} - ComplexFilterCriteria(unsigned int dbKey, std::vector &qChainKeys, std::vector &tChainKeys, double qTM, double tTM, float tstring[3], float ustring[3][3]) : + ComplexFilterCriteria(unsigned int dbKey, double qTM, double tTM, std::vector &qChainKeys, std::vector &tChainKeys, float tstring[3], float ustring[3][3]) : dbKey(dbKey), qTM(qTM), tTM(tTM), qChainKeys(qChainKeys), tChainKeys(tChainKeys), qTotalAlnLen(0), tTotalAlnLen(0) { std::copy(tstring, tstring + 3, t); for (int i = 0; i < 3; i++) { @@ -67,7 +64,6 @@ class ComplexFilterCriteria { bool hasTM(float TMThr, int covMode, int filterMode){ switch (filterMode){ - case LocalParameters::FILTER_MODE_INTERFACE: case LocalParameters::FILTER_MODE_LOOSE: switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: @@ -81,12 +77,13 @@ class ComplexFilterCriteria { case Parameters::COV_MODE_LENGTH_SHORTER : return true; } + case LocalParameters::FILTER_MODE_INTERFACE: case LocalParameters::FILTER_MODE_CONFORMATION: return true; } } - bool hasChainNum(int covMode, int filterMode, int qChainNum, int tChainNum ){ + bool hasChainNum(int covMode, int filterMode, size_t qChainNum, size_t tChainNum ){ switch (filterMode){ case LocalParameters::FILTER_MODE_INTERFACE: switch (covMode) { @@ -116,11 +113,9 @@ class ComplexFilterCriteria { } case LocalParameters::FILTER_MODE_LOOSE: return true; - } } - bool hasChainTm(float chainTMThr, int covMode, int filterMode, unsigned int qChainNum, unsigned int tChainNum) { switch (filterMode){ case LocalParameters::FILTER_MODE_INTERFACE: @@ -158,31 +153,31 @@ class ComplexFilterCriteria { case Parameters::COV_MODE_LENGTH_QUERY : case Parameters::COV_MODE_LENGTH_TARGET : case Parameters::COV_MODE_LENGTH_SHORTER : - break; - return true; + return true; } + return true; case LocalParameters::FILTER_MODE_CONFORMATION: case LocalParameters::FILTER_MODE_LOOSE: return true; - + } } bool isConformation(int filterMode, float chainTMThr){ switch (filterMode){ case LocalParameters::FILTER_MODE_CONFORMATION: - + //TODO case LocalParameters::FILTER_MODE_INTERFACE: case LocalParameters::FILTER_MODE_LOOSE: return true; - + } } - bool satisfy(int covMode, int filterMode, float covThr, float TMThr, float chainTMThr, int qChainNum, int tChainNum ) { + bool satisfy(int covMode, int filterMode, float covThr, float TMThr, float chainTMThr, size_t qChainNum, size_t tChainNum ) { const bool covOK = Util::hasCoverage(covThr, covMode, qCov, tCov); const bool TMOK = hasTM(TMThr, covMode, filterMode); const bool chainNumOK = hasChainNum(covMode, filterMode, qChainNum, tChainNum); const bool chainTMOK = hasChainTm(chainTMThr, covMode, filterMode, qChainNum, tChainNum); - const bool conformationOK = isConformation(filterMode, chainTMThr) + const bool conformationOK = isConformation(filterMode, chainTMThr); return (covOK && TMOK && chainNumOK && chainTMOK); } @@ -509,7 +504,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qChainKeys, tChainKeys, retComplex.qTmScore, retComplex.tTmScore, t, u); + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, qChainKeys, tChainKeys, t, u); localComplexMap[assId] = cmplfiltcrit; localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm); } else { @@ -522,7 +517,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); - if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, qChainKeys.size(), tChainKeys.size()))){ + if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qChainKeys.size(), tChainKeys.size()))){ assIdsToDelete.push_back(assId_res.first); } } @@ -598,4 +593,4 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t tComplexLength.clear(); return EXIT_SUCCESS; -} +} \ No newline at end of file diff --git a/src/workflow/ComplexCluster.cpp b/src/workflow/ComplexCluster.cpp deleted file mode 100644 index 823d9a11..00000000 --- a/src/workflow/ComplexCluster.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include - -#include "FileUtil.h" -#include "CommandCaller.h" -#include "Util.h" -#include "Debug.h" -#include "LocalParameters.h" - -#include "complexcluster.sh.h" - -void setComplexClusterDefaults(LocalParameters *p) { - p->covThr = 0.8; - p->filtComplexTmThr = 0.5; // FIX - // p->filtChainTmThr=0.0; // FIX - p->filterMode=0; - p->covMode = 1; - p->clusteringMode = Parameters::GREEDY; - p->removeTmpFiles = true; -} - -void setComplexClusterMustPassAlong(Parameters *p) { - p->PARAM_C.wasSet = true; - p->PARAM_REMOVE_TMP_FILES.wasSet = true; -} -int complexcluster(int argc, const char **argv, const Command &command) { - LocalParameters &par = LocalParameters::getLocalInstance(); - par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align - par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); //prefilter - par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); //align - par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); //align - par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); //align - for (size_t i = 0; i < par.createdb.size(); i++){ - par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } - par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - - setComplexClusterDefaults(&par); - par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - setComplexClusterMustPassAlong(&par); - - std::string tmpDir = par.filenames.back(); - std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); - if (par.reuseLatest) { - hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); - } - tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); - par.filenames.pop_back(); - - CommandCaller cmd; - std::cout< - -#include "LocalParameters.h" -#include "FileUtil.h" -#include "CommandCaller.h" -#include "Util.h" -#include "Debug.h" - -#include "easycomplexcluster.sh.h" - -void setEasyComplexClusterDefaults(Parameters *p) { - //TODO - p->removeTmpFiles = true; - p->writeLookup = true; -} - -void setEasyComplexClusterMustPassAlong(Parameters *p) { - //TODO - p->PARAM_REMOVE_TMP_FILES.wasSet = true; - p->PARAM_WRITE_LOOKUP.wasSet = true; -} - -int easycomplexcluster(int argc, const char **argv, const Command &command) { - LocalParameters &par = LocalParameters::getLocalInstance(); - //TODO - par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align - par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT); //prefilter - par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT); //align - par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT); //align - par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT); //align - par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT); - - for (size_t i = 0; i < par.createdb.size(); i++){ - par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } - par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); - par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - - setEasyComplexClusterDefaults(&par); - par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - setEasyComplexClusterMustPassAlong(&par); - - std::string tmpDir = par.filenames.back(); - std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); - if (par.reuseLatest) { - hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); - } - - tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); - par.filenames.pop_back(); - - CommandCaller cmd; - cmd.addVariable("TMP_PATH", tmpDir.c_str()); - cmd.addVariable("RESULT", par.filenames.back().c_str()); - par.filenames.pop_back(); - cmd.addVariable("INPUT", par.filenames.back().c_str()); - par.filenames.pop_back(); - - cmd.addVariable("RUNNER", par.runner.c_str()); - cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.structurecreatedb).c_str()); - cmd.addVariable("COMPLEXCLUSTER_PAR", par.createParameterString(par.complexclusterworkflow,true).c_str()); - cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); - cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str()); - cmd.addVariable("RESULT2REPSEQ_PAR", par.createParameterString(par.result2repseq).c_str()); - cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); - cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); - - std::string program = tmpDir + "/easycomplexcluster.sh"; - FileUtil::writeFile(program, easycomplexcluster_sh, easycomplexcluster_sh_len); - - cmd.execProgram(program.c_str(), par.filenames); - - // Should never get here - assert(false); - return EXIT_FAILURE; -} diff --git a/src/workflow/EasyMultimerCluster.cpp b/src/workflow/EasyMultimerCluster.cpp index 88d081d2..129a9b07 100644 --- a/src/workflow/EasyMultimerCluster.cpp +++ b/src/workflow/EasyMultimerCluster.cpp @@ -37,7 +37,7 @@ int easymultimercluster(int argc, const char **argv, const Command &command) { par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); - setEasyMultimerMultimerClusterDefaults(&par); + setEasyMultimerClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); setEasyMultimerClusterMustPassAlong(&par); diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp index 69315cb1..c576fb8b 100644 --- a/src/workflow/MultimerCluster.cpp +++ b/src/workflow/MultimerCluster.cpp @@ -56,8 +56,8 @@ int multimercluster(int argc, const char **argv, const Command &command) { cmd.addVariable("INPUT", par.filenames.back().c_str()); par.filenames.pop_back(); - cmd.addVariable("COMPLEXSEARCH_PAR", par.createParameterString(par.complexsearchworkflow, true).c_str()); - cmd.addVariable("FILTERCOMPLEX_PAR", par.createParameterString(par.filtercomplex).c_str()); + cmd.addVariable("MULTIMERSEARCH_PAR", par.createParameterString(par.multimersearchworkflow, true).c_str()); + cmd.addVariable("FILTERMULTIMER_PAR", par.createParameterString(par.filtermultimer).c_str()); cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); From 3df6bc466ef95cf2a894cc211df9feb45b479ea9 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 15 Jul 2024 11:09:12 +0900 Subject: [PATCH 105/160] solved everything --- src/FoldseekBase.cpp | 1 - src/strucclustutils/filtermultimer.cpp | 33 ++++++++------------------ 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index 719264ca..d089b6bd 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -287,7 +287,6 @@ std::vector foldseekCommands = { {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, - {"tmptsv", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::flatfile } } }, diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 67763680..0be890f7 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -26,9 +26,7 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { return qcov; case Parameters::COV_MODE_QUERY: return tcov; - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : + default: return 0; } } @@ -72,13 +70,10 @@ class ComplexFilterCriteria { return (tTM >= TMThr); case Parameters::COV_MODE_QUERY: return (qTM >= TMThr); - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : + default: return true; } - case LocalParameters::FILTER_MODE_INTERFACE: - case LocalParameters::FILTER_MODE_CONFORMATION: + default: return true; } } @@ -93,9 +88,7 @@ class ComplexFilterCriteria { return (alignedTChainTmScores.size()==tChainNum); case Parameters::COV_MODE_QUERY: return (alignedQChainTmScores.size()==qChainNum); - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : + default: return true; } case LocalParameters::FILTER_MODE_CONFORMATION: @@ -106,12 +99,10 @@ class ComplexFilterCriteria { return (qChainNum>=tChainNum); case Parameters::COV_MODE_QUERY: return (qChainNum<=tChainNum); - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : + default: return true; } - case LocalParameters::FILTER_MODE_LOOSE: + default: return true; } } @@ -150,14 +141,11 @@ class ComplexFilterCriteria { } } break; - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : + default: return true; } return true; - case LocalParameters::FILTER_MODE_CONFORMATION: - case LocalParameters::FILTER_MODE_LOOSE: + default: return true; } } @@ -166,8 +154,7 @@ class ComplexFilterCriteria { switch (filterMode){ case LocalParameters::FILTER_MODE_CONFORMATION: //TODO - case LocalParameters::FILTER_MODE_INTERFACE: - case LocalParameters::FILTER_MODE_LOOSE: + default: return true; } } @@ -347,7 +334,7 @@ static void getlookupInfo( lookupDB.close(); } -int filtercomplex(int argc, const char **argv, const Command &command) { +int filtermultimer(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.parseParameters(argc, argv, command, true, 0, 0); const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; From 6494f8a6595d1cd0195048ad541a9faf02eb0cec Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 15 Jul 2024 15:16:53 +0900 Subject: [PATCH 106/160] minor --- src/FoldseekBase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index d089b6bd..9cfb7159 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -377,7 +377,7 @@ std::vector foldseekCommands = { {"createmultimerreport", createmultimerreport, &localPar.createmultimerreport, COMMAND_FORMAT_CONVERSION, "Convert complexDB to tsv format", "# Create output in tsv format (9 columns): qComplexName.c_str(), tComplexName.c_str(), qChainString.c_str(), tChainString.c_str(), qTMScore, tTMScore, u, t, assId\n" - "# (1,2) identifiers for query and target multimer,\n" + "# (1,2) identifiers for query and target multimers,\n" "# (3,4) chains of query multimer and target multimer,\n" "# (5,6) tm score based on query and target residue length,\n" "# (8,9) u and t,\n" From 7e3e4764d188ad400714ff8182fe0e9dca2fe6af Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 18 Jul 2024 14:46:29 +0900 Subject: [PATCH 107/160] Recovery point : saved previous iLDDT implementation --- src/strucclustutils/filtercomplex_origin.cpp | 921 +++++++++++++++++++ src/strucclustutils/filtercomplex_test.cpp | 180 ++++ src/strucclustutils/filtermultimer.cpp | 3 +- 3 files changed, 1102 insertions(+), 2 deletions(-) create mode 100644 src/strucclustutils/filtercomplex_origin.cpp create mode 100644 src/strucclustutils/filtercomplex_test.cpp diff --git a/src/strucclustutils/filtercomplex_origin.cpp b/src/strucclustutils/filtercomplex_origin.cpp new file mode 100644 index 00000000..5d3664f0 --- /dev/null +++ b/src/strucclustutils/filtercomplex_origin.cpp @@ -0,0 +1,921 @@ +#include "DBWriter.h" +#include "Util.h" +#include "LocalParameters.h" +#include "Matcher.h" +#include "Debug.h" +#include "DBReader.h" +#include "IndexReader.h" +#include "FileUtil.h" +#include "MemoryMapped.h" +#include "Coordinate16.h" +#include "tmalign/basic_fun.h" +#include "createcomplexreport.h" +#include "LDDT.h" +#include "CalcProbTP.h" +#include + + +#ifdef OPENMP +#include +#endif + + +unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (qcov+tcov)/2; + case Parameters::COV_MODE_TARGET: + return qcov; + case Parameters::COV_MODE_QUERY: + return tcov; + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return 0; + } +} + +// bool hasChainnum(bool sameChainNum, int qChainNum, int tChainNum){ +// switch (sameChainNum){ +// case 1: +// if (qChainNum != tChainNum){ +// return false; +// }else{return true;} +// case 0: +// return true; +// } +// } + +bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores, unsigned int qChainNum, unsigned int tChainNum) { + if (chainTMThr > 0 ){ + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + if (qChainTmScores.size() coords; + unsigned int coordNum; +}; + +struct ChainForInterface { + std::vector indexVec; + unsigned int chainid; + + ChainForInterface(int idx) + : chainid(idx) {} +}; + +struct Interface{ + std::vector chainsinInterface; +}; + +struct InterfaceForLDDT{ + std::vector chainsinInterface; +}; + +struct Complex { + std::vector InterfaceVec; +}; + +struct ComplexForLDDT { + std::vector InterfaceVec; +}; + +void findInterface(float* qdata, int qLen, unsigned int qChainDbId, float* qdata2, int qLen2, unsigned int qChainDbId2, float distanceThreshold, Interface &interface) { + ChainForInterface qtmpChain(qChainDbId), qtmpChain2(qChainDbId2); + for (int qpos = 0; qpos tposIndex, qposIndex, qIndextIndex; + for (size_t i = 0; i< tChain.indexVec.size(); i++){ + tposIndex[tChain.indexVec[i]] = i; + } + for (size_t i = 0; i< qChain.indexVec.size(); i++){ + qposIndex[qChain.indexVec[i]] = i; + } + int qi = qStartPos; + int ti = tStartPos; + int mi = 0; + for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { + if (backtrace[btPos] == 'M') { + qi++; + ti++; + mi++; + } + else if (backtrace[btPos] == 'I') { + qi++; + } + else { + ti++; + } + if (qposIndex.find(qi) != qposIndex.end()){ + if(tposIndex.find(ti) != tposIndex.end()){ + qIndextIndex[qposIndex[qi]] = tposIndex[ti]; + } + } + } + qNewChain.coordNum = qIndextIndex.size(); + tNewChain.coordNum = qIndextIndex.size(); + for (auto &pair : qIndextIndex){ + qNewChain.coords.push_back(qdata[qChain.indexVec[pair.first]]); + tNewChain.coords.push_back(tdata[tChain.indexVec[pair.second]]); + } + for (auto &pair : qIndextIndex){ + qNewChain.coords.push_back(qdata[qSeqLen + qChain.indexVec[pair.first]]); + tNewChain.coords.push_back(tdata[tSeqLen + tChain.indexVec[pair.second]]); + } + for (auto &pair : qIndextIndex){ + qNewChain.coords.push_back(qdata[2*qSeqLen + qChain.indexVec[pair.first]]); + tNewChain.coords.push_back(tdata[2*tSeqLen + tChain.indexVec[pair.second]]); + } +} + +class ComplexFilterCriteria { +public: + unsigned int dbKey; + unsigned int qTotalAlnLen; + unsigned int tTotalAlnLen; + float qCov; + float tCov; + double qTM; + double tTM; + Complex tComplex; + ComplexForLDDT qnewComplex; + ComplexForLDDT tnewComplex; + std::vector alignedQChainTmScores; + std::vector alignedTChainTmScores; + std::vector intLDDTScores; + ComplexFilterCriteria() {} + ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM, double qChainTm, double tChainTm, Complex &tComplex, ComplexForLDDT &qnewComplex, ComplexForLDDT &tnewComplex) : + dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM), tComplex(tComplex), qnewComplex(qnewComplex), tnewComplex(tnewComplex) { + alignedQChainTmScores.push_back(qChainTm); + alignedTChainTmScores.push_back(tChainTm); + } + ~ComplexFilterCriteria() { + alignedQChainTmScores.clear(); + alignedTChainTmScores.clear(); + } + + bool hasTM(float TMThr, int covMode, int filterMode){ + switch (filterMode){ + case LocalParameters::FILTER_MODE_INTERFACE: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return ((qTM>= TMThr) && (tTM >= TMThr)); + case Parameters::COV_MODE_TARGET: + return (tTM >= TMThr); + case Parameters::COV_MODE_QUERY: + return (qTM >= TMThr); + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return true; + } + } + } + + bool hasChainNum(int covMode, int filterMode, int qChainNum, int tChainNum ){ + switch (filterMode){ + case LocalParameters::FILTER_MODE_INTERFACE: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); + case Parameters::COV_MODE_TARGET: + return (alignedTChainTmScores.size()==tChainNum); + case Parameters::COV_MODE_QUERY: + return (alignedQChainTmScores.size()==qChainNum); + case Parameters::COV_MODE_LENGTH_QUERY : + case Parameters::COV_MODE_LENGTH_TARGET : + case Parameters::COV_MODE_LENGTH_SHORTER : + return true; + } + case LocalParameters::FILTER_MODE_CONFORMATION: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return (qChainNum==tChainNum); + default: + return true; + } + case LocalParameters::FILTER_MODE_LOOSE: + return true; + + } + } + + // bool hasMatchedCoord(int filterMode){ + // switch (filterMode) { + // case LocalParameters::FILTER_MODE_INTERFACE: + // return (sameCoord); + // case LocalParameters::FILTER_MODE_CONFORMATION: + // case LocalParameters::FILTER_MODE_LOOSE: + // return true; + // } + // } + + void calLDDT( std::map &qIntpostotIntpos ){ + float* qdatatmp1; + float* qdatatmp2; + float* tdatatmp1; + float* tdatatmp2; + + for (auto pair : qIntpostotIntpos){ + size_t seqlength1 = qnewComplex.InterfaceVec[pair.first].chainsinInterface[0].coords.size()/3; + size_t seqlength2 = qnewComplex.InterfaceVec[pair.first].chainsinInterface[1].coords.size()/3; + if (seqlength1 == 0 || seqlength2 == 0){ + continue; + } + qdatatmp1 = qnewComplex.InterfaceVec[pair.first].chainsinInterface[0].coords.data(); + qdatatmp2 = qnewComplex.InterfaceVec[pair.first].chainsinInterface[1].coords.data(); + tdatatmp1 = tnewComplex.InterfaceVec[pair.second].chainsinInterface[0].coords.data(); + tdatatmp2 = tnewComplex.InterfaceVec[pair.second].chainsinInterface[1].coords.data(); + + LDDTCalculator *lddtcalculator = NULL; + lddtcalculator = new LDDTCalculator(seqlength1 + 1, seqlength1 + 1); + lddtcalculator->initQuery(seqlength1, qdatatmp1, &qdatatmp1[seqlength1], &qdatatmp1[2*seqlength1]); + LDDTCalculator::LDDTScoreResult lddtres; + std::string backtrace(seqlength1, 'M'); + lddtres = lddtcalculator->computeLDDTScore(seqlength1, 0, 0, backtrace, tdatatmp1, &tdatatmp1[seqlength1], &tdatatmp1[2*seqlength1]); + double lddtresScore = lddtres.avgLddtScore; + + delete lddtcalculator; + lddtcalculator = new LDDTCalculator(seqlength2 + 1, seqlength2 + 1); + lddtcalculator->initQuery(seqlength2, qdatatmp2, &qdatatmp2[seqlength2], &qdatatmp2[2*seqlength2]); + LDDTCalculator::LDDTScoreResult lddtres2; + std::string backtrace2(seqlength2, 'M'); + lddtres2 = lddtcalculator->computeLDDTScore(seqlength2, 0, 0, backtrace2, tdatatmp2, &tdatatmp2[seqlength2], &tdatatmp2[2*seqlength2]); + lddtresScore += lddtres2.avgLddtScore; + intLDDTScores.push_back(lddtresScore/2); + delete lddtcalculator; + } + } + + bool hasintLDDT (int covMode, int filterMode, float intLDDTThr){ + switch (filterMode){ + case LocalParameters::FILTER_MODE_INTERFACE: + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + // if (intLDDTScores.size()qTotalAlnLen += qTotalAlnLen; + this->tTotalAlnLen += tTotalAlnLen; + this->alignedQChainTmScores.push_back(qChainTm); + this->alignedTChainTmScores.push_back(tChainTm); + } + + void calcCov(unsigned int qLen, unsigned int tLen) { + qCov = static_cast(qTotalAlnLen) / static_cast(qLen); + tCov = static_cast(tTotalAlnLen) / static_cast(tLen); + } +}; + +void fillUArr(const std::string &uString, float (&u)[3][3]) { + std::string tmp; + int i = 0; + int j=0; + const int ulen = static_cast(uString.size()); + for (int k=0; k < ulen; k++) { + if (k==ulen-1) { + u[i][j] = std::stof(tmp); + } else if (uString[k] == ',') { + u[i][j] = std::stof(tmp); + tmp.clear(); + j++; + } else { + tmp.push_back(uString[k]); + } + if (j == 3) { + i++; + j = 0; + } + } +} + +void fillTArr(const std::string &tString, float (&t)[3]) { + std::string tmp; + int i = 0; + const int tlen = static_cast(tString.size()); + for (int k=0; k &ChainKeys) { + unsigned int ResidueLen = 0; + for (auto ChainKey: ChainKeys) { + size_t id = Dbr->sequenceReader->getId(ChainKey); + // Not accessible + if (id == NOT_AVAILABLE_CHAIN_KEY) + return 0; + ResidueLen += Dbr->sequenceReader->getSeqLen(id); + } + return ResidueLen; +} + +static void getlookupInfo( + const std::string &file, + std::map &complexIdtoName, + std::map &chainKeyToComplexIdLookup, + std::map> &complexIdToChainKeysLookup, + std::vector &complexIdVec +) { + if (file.length() == 0) { + return; + } + MemoryMapped lookupDB(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); + char *data = (char *) lookupDB.getData(); + char *end = data + lookupDB.mappedSize(); + const char *entry[255]; + int prevComplexId = -1; + while (data < end && *data != '\0') { + const size_t columns = Util::getWordsOfLine(data, entry, 255); + if (columns < 3) { + Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n"; + continue; + } + auto chainKey = Util::fast_atoi(entry[0]); + std::string chainName(entry[1], (entry[2] - entry[1]) - 1); + auto complexId = Util::fast_atoi(entry[2]); + chainKeyToComplexIdLookup.emplace(chainKey, complexId); + + size_t lastUnderscoreIndex = chainName.find_last_of('_'); + std::string complexName = chainName.substr(0, lastUnderscoreIndex); + + if (complexId != prevComplexId) { + complexIdToChainKeysLookup.emplace(complexId, std::vector()); + complexIdVec.emplace_back(complexId); + complexIdtoName.emplace(complexId, complexName); + prevComplexId = complexId; + } + complexIdToChainKeysLookup.at(complexId).emplace_back(chainKey); + data = Util::skipLine(data); + } + lookupDB.close(); +} + +// static void getInterfaceIndex(float* &data1, int size1, float* &data2, int size2, int interfaceDistThr, std::vector &x, std::vector &y, std::vector &z){ +// unsigned int interLen = 0; +// for (int i =0; i < size1; i ++){ +// for (int j =0; j < size2; j ++){ +// float atomDist = BasicFunction::dist(data1[i], data1[i+size1], data1[i+size1*2], data2[j], data2[j+size2], data2[j+size2*2]); +// if (atomDist<= interfaceDistThr){ +// x.push_back(data1[i]); +// y.push_back(data1[i+size1]); +// z.push_back(data1[i+size1*2]); +// x.push_back(data2[j]); +// y.push_back(data2[j+size2]); +// z.push_back(data2[j+size2*2]); +// interLen ++; +// } +// } +// } +// Coordinates +// } + +// static void getInterface(std::vector &ChainKeys, DBReader &StructDbr, std::vector &interfaceIndexVec, IndexReader* &DBr, unsigned int thread_idx, int interfaceDistThr){ +// for (size_t i = 0; i < ChainKeys.size(); ++i) { +// unsigned int ChainDbId1 = DBr->sequenceReader->getId(ChainKeys[i]); +// char *cadata1 = StructDbr.getData(ChainDbId1, thread_idx); +// size_t CaLength1 = StructDbr.getEntryLen(ChainDbId1); +// size_t seqLength1 = StructDbr.getSeqLen(ChainDbId1); +// Coordinate16 coords1; +// float* data1 = coords1.read(cadata1, seqLength1, CaLength1); +// for (size_t j = i + 1; j < ChainKeys.size(); ++j) { +// unsigned int ChainDbId2 = DBr->sequenceReader->getId(ChainKeys[j]); +// char *cadata2 = StructDbr.getData(ChainDbId2, thread_idx); +// size_t CaLength2 = StructDbr.getEntryLen(ChainDbId2); +// size_t seqLength2 = StructDbr.getSeqLen(ChainDbId2); +// Coordinate16 coords2; +// float* data2 = coords2.read(cadata2, seqLength2, CaLength2); +// std::vector x; +// std::vector y; +// std::vector z; +// getInterfaceIndex(data1, seqLength1, data2, seqLength2, interfaceDistThr, x, y, z); +// interfaceIndexVec.push_back(interfaceCoord); +// } +// } +// } + +int filtercomplex(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; + const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + int dbaccessMode = (DBReader::USE_INDEX); + + IndexReader* qDbr = NULL; + qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + DBReader qStructDbr((par.db1 + "_ca").c_str(), (par.db1 + "_ca.index").c_str(), + par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + qStructDbr.open(DBReader::NOSORT); + + IndexReader* tDbr = NULL; + DBReader *tStructDbr = NULL; + if (sameDB) { + tDbr = qDbr; + tStructDbr = &qStructDbr; + } + else{ + tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + tStructDbr = new DBReader((par.db2 + "_ca").c_str(), (par.db2 + "_ca.index").c_str(), + par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + tStructDbr->open(DBReader::NOSORT); + } + DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX| DBReader::USE_DATA); + alnDbr.open(DBReader::LINEAR_ACCCESS); + size_t localThreads = 1; + + // Debug(Debug::WARNING) << "Monomer will be treated as singleton\nMonomer chain key: \n"; +#ifdef OPENMP +localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); +#endif + const bool shouldCompress = (par.compressed == true); + const int db4Type = Parameters::DBTYPE_CLUSTER_RES; + + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, db4Type); + resultWriter.open(); + + const int db5Type = Parameters::DBTYPE_GENERIC_DB; + //TODO: remove resultWrite5 when done + DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), 1, shouldCompress, db5Type); + resultWrite5.open(); + + std::string qLookupFile = par.db1 + ".lookup"; + std::string tLookupFile = par.db2 + ".lookup"; + + chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; + complexIdToChainKeys_t qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; + std::map qcomplexIdToName, tcomplexIdToName; + std::vector qComplexIdVec, tComplexIdVec; + getlookupInfo(qLookupFile, qcomplexIdToName,qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); + getlookupInfo(tLookupFile, tcomplexIdToName, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); + qChainKeyToComplexIdMap.clear(); + // Debug::Progress progress(qComplexIdVec.size()); + std::map qComplexLength, tComplexLength; + std::map qComplexIdResult; + + for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { + unsigned int tComplexId = tComplexIdVec[tComplexIdx]; + std::vector &tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); + if (tChainKeys.empty()) { + continue; + } + unsigned int reslen = getComplexResidueLength(tDbr, tChainKeys); + tComplexLength[tComplexId] =reslen; + } + for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { + unsigned int qComplexId = qComplexIdVec[qComplexIdx]; + std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); + if (qChainKeys.empty()) { + continue; + } + unsigned int reslen = getComplexResidueLength(qDbr, qChainKeys); + qComplexLength[qComplexId] = reslen; + } + + + +#pragma omp parallel num_threads(localThreads) + { + resultToWrite_t result5; + char buffer[32]; + unsigned int thread_idx = 0; + //TODO: set threshold for interface. now 15A + int interfaceDistThr =15; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); +#endif + std::string result; + std::map localComplexMap; + std::vector assIdsToDelete; + std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] + std::vector selectedAssIDs; + Matcher::result_t res; + std::map> assIDtoqChainIdtotChainId; + std::map> assIdtoqIntpostotIntpos; + +#pragma omp for schedule(dynamic, 1) + for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { + Coordinate16 qcoords; + Coordinate16 tcoords; + // progress.updateProgress(); + unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; + std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); + + // std::vector> qInterfaceIndexVec; + // getInterface(qChainKeys, qStructDbr, qInterfaceIndexVec, qDbr, thread_idx, interfaceDistThr); + Complex qComplex; + for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ){ + unsigned int qChainKey = qChainKeys[qChainIdx]; + unsigned int qChainAlnId = alnDbr.getId(qChainKey); + //TODO: if monomer + if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + continue; + } + unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); + char *data = alnDbr.getData(qChainAlnId, thread_idx); + char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); + size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); + size_t qSeqlen = qDbr->sequenceReader->getSeqLen(qChainDbId); + float* qdata = qcoords.read(qcadata, qSeqlen, qCaLength); + while (*data != '\0' ) { + ComplexDataHandler retComplex = parseScoreComplexResult(data, res); + data = Util::skipLine(data); + unsigned int assId = retComplex.assId; + unsigned int tChainKey= res.dbKey; + unsigned int tChainAlnId = alnDbr.getId(tChainKey); + unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); + if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + continue; + } + assIDtoqChainIdtotChainId[assId][qChainDbId] = tChainDbId ; + } + for (size_t qChainIdx2 = qChainIdx+1; qChainIdx2 < qChainKeys.size(); qChainIdx2++ ){ + unsigned int qChainKey2 = qChainKeys[qChainIdx2]; + unsigned int qChainAlnId2 = alnDbr.getId(qChainKey2); + unsigned int qChainDbId2 = qDbr->sequenceReader->getId(qChainKey2); + char *qcadata2 = qStructDbr.getData(qChainDbId2, thread_idx); + size_t qCaLength2 = qStructDbr.getEntryLen(qChainDbId2); + size_t qSeqlen2 = qDbr->sequenceReader->getSeqLen(qChainDbId2); + float* qdata2 = qcoords.read(qcadata2, qSeqlen2, qCaLength2); + Interface interface; + findInterface(qdata, qSeqlen, qChainDbId, qdata2, qSeqlen2, qChainDbId2, interfaceDistThr, interface); + if (interface.chainsinInterface[0].indexVec.size() > 0){ + qComplex.InterfaceVec.push_back(interface); + } + } + } + for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { + unsigned int qChainKey = qChainKeys[qChainIdx]; + unsigned int qChainAlnId = alnDbr.getId(qChainKey); + unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); + //handling monomer as singleton + if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); + result.append(buffer, (outpos - buffer - 1)); + result.push_back('\n'); + result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(qComplexId) + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); + break; + } + + char *data = alnDbr.getData(qChainAlnId, thread_idx); + while (*data != '\0' ) { + ComplexDataHandler retComplex = parseScoreComplexResult(data, res); + char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); + size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); + float* qdata = qcoords.read(qcadata, res.qLen, qCaLength); + unsigned int qSeqlen = qDbr->sequenceReader->getSeqLen(qChainDbId); + if (!retComplex.isValid){ + Debug(Debug::ERROR) << "No scorecomplex result provided"; + EXIT(EXIT_FAILURE); + } + + data = Util::skipLine(data); + unsigned int assId = retComplex.assId; + unsigned int tChainKey= res.dbKey; + unsigned int tChainAlnId = alnDbr.getId(tChainKey); + unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); + //if target is monomer, break to be singleton + if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + continue; + } + unsigned int tComplexId = tChainKeyToComplexIdMap.at(tChainKey); + std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); + float u[3][3]; + float t[3]; + fillUArr(retComplex.uString, u); + fillTArr(retComplex.tString, t); + char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); + size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); + unsigned int tSeqlen = tDbr->sequenceReader->getSeqLen(tChainDbId); + float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); + unsigned int alnLen = cigarToAlignedLength(res.backtrace); + Coordinates qm(alnLen), tm(alnLen); + fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); + double qChainTm = chainTm / res.qLen; + double tChainTm = chainTm/ res.dbLen; + unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + if (localComplexMap.find(assId) == localComplexMap.end()) { + Complex tComplex; + for (size_t tChainIdx = 0; tChainIdx < tChainKeys.size(); tChainIdx++ ){ + unsigned int tChainKey = tChainKeys[tChainIdx]; + unsigned int tChainAlnId = alnDbr.getId(tChainKey); + unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); + //TODO: if monomer + if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + break; + } + + char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); + size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); + float* tdata = tcoords.read(tcadata, tSeqlen, tCaLength); + + for (size_t tChainIdx2 = tChainIdx+1; tChainIdx2 < tChainKeys.size(); tChainIdx2++ ){ + unsigned int tChainKey2 = tChainKeys[tChainIdx2]; + unsigned int tChainAlnId2 = alnDbr.getId(tChainKey2); + unsigned int tChainDbId2 = tDbr->sequenceReader->getId(tChainKey2); + char *tcadata2 = tStructDbr->getData(tChainDbId2, thread_idx); + size_t tCaLength2 = tStructDbr->getEntryLen(tChainDbId2); + size_t tSeqlen2 = tDbr->sequenceReader->getSeqLen(tChainDbId2); + float* tdata2 = tcoords.read(tcadata2, tSeqlen2, tCaLength2); + Interface interface; + findInterface(tdata, tSeqlen, tChainDbId, tdata2, tSeqlen2, tChainDbId2, interfaceDistThr, interface); + if (interface.chainsinInterface[0].indexVec.size() > 0){ + tComplex.InterfaceVec.push_back(interface); + } + } + } + + ComplexForLDDT qnewComplex, tnewComplex; + for(size_t intnum=0 ; intnum <= qComplex.InterfaceVec.size(); intnum ++){ + InterfaceForLDDT inter; + qnewComplex.InterfaceVec.push_back(inter); + } + for(size_t intnum=0 ; intnum <= tComplex.InterfaceVec.size(); intnum ++){ + InterfaceForLDDT inter; + tnewComplex.InterfaceVec.push_back(inter); + } + for (size_t qIntpos = 0; qIntpos < qComplex.InterfaceVec.size(); qIntpos++){ + for (size_t tIntpos = 0; tIntpos < tComplex.InterfaceVec.size(); tIntpos++){ + if (assIDtoqChainIdtotChainId[assId].find(qComplex.InterfaceVec[qIntpos].chainsinInterface[0].chainid)!= assIDtoqChainIdtotChainId[assId].end() && assIDtoqChainIdtotChainId[assId].find(qComplex.InterfaceVec[qIntpos].chainsinInterface[1].chainid)!= assIDtoqChainIdtotChainId[assId].end()){ + if (tComplex.InterfaceVec[tIntpos].chainsinInterface[0].chainid == assIDtoqChainIdtotChainId[assId][qComplex.InterfaceVec[qIntpos].chainsinInterface[0].chainid] && tComplex.InterfaceVec[tIntpos].chainsinInterface[1].chainid == assIDtoqChainIdtotChainId[assId][qComplex.InterfaceVec[qIntpos].chainsinInterface[1].chainid]){ + assIdtoqIntpostotIntpos[assId][qIntpos] = tIntpos ; + } + else if (tComplex.InterfaceVec[tIntpos].chainsinInterface[1].chainid == assIDtoqChainIdtotChainId[assId][qComplex.InterfaceVec[qIntpos].chainsinInterface[0].chainid] && tComplex.InterfaceVec[tIntpos].chainsinInterface[0].chainid == assIDtoqChainIdtotChainId[assId][qComplex.InterfaceVec[qIntpos].chainsinInterface[1].chainid]) { + assIdtoqIntpostotIntpos[assId][qIntpos] = tIntpos ; + } + } + } + } + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm, tComplex, qnewComplex, tnewComplex); + localComplexMap[assId] = cmplfiltcrit; + } else { + localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); + } + for (auto pair : assIdtoqIntpostotIntpos[assId]){ + for (ChainForInterface qChain : qComplex.InterfaceVec[pair.first].chainsinInterface){ + for (ChainForInterface tChain : localComplexMap.at(assId).tComplex.InterfaceVec[pair.second].chainsinInterface){ + if(qChain.chainid == qChainDbId && tChain.chainid == tChainDbId){ + ChainForLDDT qChaintmp, tChaintmp; + AlignedInterface(qdata, tdata, qSeqlen, tSeqlen, qChain, tChain, res.backtrace, res.qStartPos, res.dbStartPos, qChaintmp, tChaintmp); + localComplexMap.at(assId).qnewComplex.InterfaceVec[pair.first].chainsinInterface.push_back(qChaintmp); + localComplexMap.at(assId).tnewComplex.InterfaceVec[pair.second].chainsinInterface.push_back(tChaintmp); + } + } + } + } + } // while end + } + for (auto& assId_res : localComplexMap){ + unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); + assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); + if (!assIdtoqIntpostotIntpos[assId_res.first].empty()){ + assId_res.second.calLDDT(assIdtoqIntpostotIntpos[assId_res.first]); + } + std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); + if (!assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, par.intLDDTThr, qChainKeys.size(), tChainKeys.size())){ + assIdsToDelete.push_back(assId_res.first); + } + } + + for (const auto& key : assIdsToDelete) { + localComplexMap.erase(key); + } + + for (const auto& assId_res : localComplexMap){ + unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); + unsigned int alnlen = adjustAlnLen(assId_res.second.qTotalAlnLen, assId_res.second.tTotalAlnLen, par.covMode); + if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ + cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + } + else { + if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]){ + cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + } + } + } + + for (const auto& pair : cmplIdToBestAssId){ + selectedAssIDs.push_back(pair.second[0]); + } + + for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ + unsigned int assId = selectedAssIDs[assIdidx]; + unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).dbKey); + char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); + result.append(buffer, (outpos - buffer - 1)); + result.push_back('\n'); + result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(tComplexId) + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); + } + #pragma omp critical + { + qComplexIdResult[qComplexId]= result; + } + result.clear(); + localComplexMap.clear(); + assIdsToDelete.clear(); + cmplIdToBestAssId.clear(); + selectedAssIDs.clear(); + assIdtoqIntpostotIntpos.clear(); + assIDtoqChainIdtotChainId.clear(); + } // for end + #pragma omp critical + { + resultWrite5.writeData(result5.c_str(), result5.length(), 0); + } + result5.clear(); + } // MP end + for (auto &pair : qComplexIdResult){ + resultWriter.writeData(pair.second.c_str(), pair.second.length(), pair.first); + } + + resultWriter.close(true); + resultWrite5.close(par.dbOut == false); + qStructDbr.close(); + alnDbr.close(); + delete qDbr; + if (sameDB == false) { + delete tDbr; + delete tStructDbr; + } + qChainKeyToComplexIdMap.clear(); + tChainKeyToComplexIdMap.clear(); + qComplexIdToChainKeyMap.clear(); + tComplexIdToChainKeyMap.clear(); + qcomplexIdToName.clear(); + tcomplexIdToName.clear(); + qComplexIdVec.clear(); + tComplexIdVec.clear(); + qComplexLength.clear(); + tComplexLength.clear(); + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/src/strucclustutils/filtercomplex_test.cpp b/src/strucclustutils/filtercomplex_test.cpp new file mode 100644 index 00000000..c4224cce --- /dev/null +++ b/src/strucclustutils/filtercomplex_test.cpp @@ -0,0 +1,180 @@ +#include "DBWriter.h" +#include "DBReader.h" +#include "IndexReader.h" +#include "createcomplexreport.h" +#include "LocalParameters.h" +#include "Coordinate16.h" + +// #include "Util.h" +// #include "Matcher.h" +// #include "Debug.h" +// #include "FileUtil.h" +// #include "MemoryMapped.h" +// #include "tmalign/basic_fun.h" +// #include "LDDT.h" +// #include "CalcProbTP.h" +// #include + +#ifdef OPENMP +#include +#endif + +struct Complex { + int complexId; + std::string complexName; + + unsigned int nChain; + std::vector chainKeys; + + unsigned int complexLength; + + // Coordinate16 Coords; +} + +static void getlookupInfo( + const std::string &file, + std::vector &complexes, + std::map &chainKeyToComplexIdLookup, + std::map &complexIdtoIdx + // std::map> &complexIdToChainKeysLookup, + // std::map &complexIdtoName, + // std::vector &complexIdVec +) { + if (file.length() == 0) { + return; + } + MemoryMapped lookupDB(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); + char *data = (char *) lookupDB.getData(); + char *end = data + lookupDB.mappedSize(); + const char *entry[255]; + int prevComplexId = -1; + int nComplex = 0; + while (data < end && *data != '\0') { + const size_t columns = Util::getWordsOfLine(data, entry, 255); + if (columns < 3) { + Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n"; + continue; + } + auto chainKey = Util::fast_atoi(entry[0]); + auto complexId = Util::fast_atoi(entry[2]); + chainKeyToComplexIdLookup.emplace(chainKey, complexId); + + if (complexId != prevComplexId) { + std::string chainName(entry[1], (entry[2] - entry[1]) - 1); + size_t lastUnderscoreIndex = chainName.find_last_of('_'); + std::string complexName = chainName.substr(0, lastUnderscoreIndex); + + Complex complex; + complex.complexId = complexId; + complex.nChain = 1; + complex.complexName = complexName; + complex.chainKeys.emplace_back(chainKey); + complexes.emplace_back(complex); + complexIdtoIdx.emplace(complexId, nComplex); + + prevComplexId = complexId; + nComplex++; + } + else { + complexes.back().nChain++; + complexes.back().chainKeys.emplace_back(chainKey); + } + + data = Util::skipLine(data); + } + lookupDB.close(); +} + +static void sumComplexLength (DBReader &structDbr, std::vector &complexes) { + // Fill in the complex length + for (size_t complexIdx = 0; complexIdx < complexes.size(); complexIdx++) { + Complex &cmpl = complexes[complexIdx]; + if (cmpl.chainKeys.size() == 0) { + continue; + } + unsigned int cmplId = cmpl.complexId; + unsigned int cmplLen = 0; + for (size_t chainIdx = 0; chainIdx < cmpl.chainKeys.size(); chainIdx++) { + unsigned int chainKey = cmpl.chainKeys[chainIdx]; + structDbr.get(chainKey); + cmplLen += structDbr.getSequenceLength(); + } + cmpl.complexLength = cmplLen; + } + +} + +int filtercomplex(int argc, const char **argv, const Command &command) { + LocalParameters &par = LocalParameters::getLocalInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; + const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + int dbaccessMode = (DBReader::USE_INDEX); + + IndexReader* qDbr = NULL; + qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + DBReader qStructDbr((par.db1 + "_ca").c_str(), (par.db1 + "_ca.index").c_str(), + par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + qStructDbr.open(DBReader::NOSORT); + + IndexReader* tDbr = NULL; + DBReader *tStructDbr = NULL; + if (sameDB) { + tDbr = qDbr; + tStructDbr = &qStructDbr; + } + else{ + tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + tStructDbr = new DBReader((par.db2 + "_ca").c_str(), (par.db2 + "_ca.index").c_str(), + par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + tStructDbr->open(DBReader::NOSORT); + } + + DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX| DBReader::USE_DATA); + alnDbr.open(DBReader::LINEAR_ACCCESS); + size_t localThreads = 1; + +#ifdef OPENMP +localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); +#endif + + const bool shouldCompress = (par.compressed == true); + const int db4Type = Parameters::DBTYPE_CLUSTER_RES; + + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, db4Type); + resultWriter.open(); + + //TODO: remove resultWrite5 when done + const int db5Type = Parameters::DBTYPE_GENERIC_DB; + DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), 1, shouldCompress, db5Type); + resultWrite5.open(); + + std::string qLookupFile = par.db1 + ".lookup"; + std::string tLookupFile = par.db2 + ".lookup"; + + chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; + // complexIdToChainKeys_t qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; + // std::map qcomplexIdToName, tcomplexIdToName; + // std::vector qComplexIdVec, tComplexIdVec; + std::vector qComplexes, tComplexes; + std::map qComplexIdtoIdx, tComplexIdtoIdx; + // getlookupInfo(qLookupFile, qcomplexIdToName,qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); + // std::map qComplexLength, tComplexLength; + // std::map qComplexIdResult; + getlookupInfo(qLookupFile, qComplexes, qChainKeyToComplexIdMap, qComplexIdtoIdx); + + // Fill in the complex length + sumComplexLength(qStructDbr, qComplexes); + + if (sameDB) { + tComplexes = qComplexes; + tChainKeyToComplexIdMap = qChainKeyToComplexIdMap; + tComplexIdtoIdx = qComplexIdtoIdx; + } + else { + getlookupInfo(tLookupFile, tComplexes, tChainKeyToComplexIdMap, tComplexIdtoIdx); + sumComplexLength(tStructDbr, tComplexes); + } + + return EXIT_SUCCESS; +} diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 0be890f7..9733cb35 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -10,8 +10,7 @@ #include "Coordinate16.h" #include "tmalign/basic_fun.h" #include "MultimerUtil.h" -#include "LDDT.h" -#include "CalcProbTP.h" +// #include "LDDT.h" #include #ifdef OPENMP From 3922544e12bf524e6aa356fde7e882c6d28b185d Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 18 Jul 2024 14:53:34 +0900 Subject: [PATCH 108/160] Inactivated filter-mode param: chainNum & conformation is affected --- src/strucclustutils/filtermultimer.cpp | 158 +++++++++++-------------- 1 file changed, 70 insertions(+), 88 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 9733cb35..369e2e4f 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -60,111 +60,93 @@ class ComplexFilterCriteria { } bool hasTM(float TMThr, int covMode, int filterMode){ - switch (filterMode){ - case LocalParameters::FILTER_MODE_LOOSE: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return ((qTM>= TMThr) && (tTM >= TMThr)); - case Parameters::COV_MODE_TARGET: - return (tTM >= TMThr); - case Parameters::COV_MODE_QUERY: - return (qTM >= TMThr); - default: - return true; - } + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + return ((qTM>= TMThr) && (tTM >= TMThr)); + case Parameters::COV_MODE_TARGET: + return (tTM >= TMThr); + case Parameters::COV_MODE_QUERY: + return (qTM >= TMThr); default: return true; } } - bool hasChainNum(int covMode, int filterMode, size_t qChainNum, size_t tChainNum ){ - switch (filterMode){ - case LocalParameters::FILTER_MODE_INTERFACE: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); - case Parameters::COV_MODE_TARGET: - return (alignedTChainTmScores.size()==tChainNum); - case Parameters::COV_MODE_QUERY: - return (alignedQChainTmScores.size()==qChainNum); - default: - return true; - } - case LocalParameters::FILTER_MODE_CONFORMATION: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return (qChainNum==tChainNum); - case Parameters::COV_MODE_TARGET: - return (qChainNum>=tChainNum); - case Parameters::COV_MODE_QUERY: - return (qChainNum<=tChainNum); - default: - return true; - } - default: - return true; - } - } + // bool hasChainNum(int covMode, int filterMode, size_t qChainNum, size_t tChainNum ){ + // switch (filterMode){ + // case LocalParameters::FILTER_MODE_INTERFACE: + // switch (covMode) { + // case Parameters::COV_MODE_BIDIRECTIONAL: + // return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); + // case Parameters::COV_MODE_TARGET: + // return (alignedTChainTmScores.size()==tChainNum); + // case Parameters::COV_MODE_QUERY: + // return (alignedQChainTmScores.size()==qChainNum); + // default: + // return true; + // } + // case LocalParameters::FILTER_MODE_CONFORMATION: + // switch (covMode) { + // case Parameters::COV_MODE_BIDIRECTIONAL: + // return (qChainNum==tChainNum); + // case Parameters::COV_MODE_TARGET: + // return (qChainNum>=tChainNum); + // case Parameters::COV_MODE_QUERY: + // return (qChainNum<=tChainNum); + // default: + // return true; + // } + // default: + // return true; + // } + // } bool hasChainTm(float chainTMThr, int covMode, int filterMode, unsigned int qChainNum, unsigned int tChainNum) { - switch (filterMode){ - case LocalParameters::FILTER_MODE_INTERFACE: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - if (alignedQChainTmScores.size() Date: Sun, 21 Jul 2024 00:02:12 +0900 Subject: [PATCH 109/160] Made Complex struct and implementation is in progress --- src/strucclustutils/filtermultimer.cpp | 137 ++++++++++++++----------- 1 file changed, 77 insertions(+), 60 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 369e2e4f..ba9d84ef 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -17,6 +17,19 @@ #include #endif +struct Complex { + int complexId; + std::string complexName; + unsigned int nChain; + std::vector chainKeys; + + unsigned int complexLength; + + // Coordinate16 Coords; + + Complex() : complexId(0), complexName(""), nChain(0), complexLength(0) {} +}; + unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: @@ -262,24 +275,31 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u return tmscore; } -unsigned int getComplexResidueLength( IndexReader *Dbr, std::vector &ChainKeys) { - unsigned int ResidueLen = 0; - for (auto ChainKey: ChainKeys) { - size_t id = Dbr->sequenceReader->getId(ChainKey); +void getComplexResidueLength( IndexReader *Dbr, std::vector &complexes) { + for (size_t complexIdx = 0; complexIdx < complexes.size(); complexIdx++) { + Complex *complex = &complexes[complexIdx]; + unsigned int complexId = complex->complexId; + std::vector &chainKeys = complex->chainKeys; + if (chainKeys.empty()) { + continue; + } + unsigned int reslen = 0; + for (auto chainKey: chainKeys) { + size_t id = Dbr->sequenceReader->getId(chainKey); // Not accessible if (id == NOT_AVAILABLE_CHAIN_KEY) - return 0; - ResidueLen += Dbr->sequenceReader->getSeqLen(id); + continue; + reslen += Dbr->sequenceReader->getSeqLen(id); } - return ResidueLen; + complex->complexLength = reslen; + } } static void getlookupInfo( const std::string &file, - std::map &complexIdtoName, std::map &chainKeyToComplexIdLookup, - std::map> &complexIdToChainKeysLookup, - std::vector &complexIdVec + std::vector &complexes, + std::map &complexIdtoIdx ) { if (file.length() == 0) { return; @@ -288,7 +308,9 @@ static void getlookupInfo( char *data = (char *) lookupDB.getData(); char *end = data + lookupDB.mappedSize(); const char *entry[255]; + int prevComplexId = -1; + int nComplex = 0; while (data < end && *data != '\0') { const size_t columns = Util::getWordsOfLine(data, entry, 255); if (columns < 3) { @@ -296,20 +318,26 @@ static void getlookupInfo( continue; } auto chainKey = Util::fast_atoi(entry[0]); - std::string chainName(entry[1], (entry[2] - entry[1]) - 1); auto complexId = Util::fast_atoi(entry[2]); chainKeyToComplexIdLookup.emplace(chainKey, complexId); + std::string chainName(entry[1], (entry[2] - entry[1]) - 1); size_t lastUnderscoreIndex = chainName.find_last_of('_'); std::string complexName = chainName.substr(0, lastUnderscoreIndex); if (complexId != prevComplexId) { - complexIdToChainKeysLookup.emplace(complexId, std::vector()); - complexIdVec.emplace_back(complexId); - complexIdtoName.emplace(complexId, complexName); + + Complex complex; + complex.complexId = complexId; + complex.complexName = complexName; + complexIdtoIdx.emplace(complexId, nComplex); + complexes.emplace_back(complex); + prevComplexId = complexId; + nComplex++; } - complexIdToChainKeysLookup.at(complexId).emplace_back(chainKey); + complexes.back().chainKeys.emplace_back(chainKey); + complexes.back().nChain++; data = Util::skipLine(data); } lookupDB.close(); @@ -361,37 +389,23 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t std::string qLookupFile = par.db1 + ".lookup"; std::string tLookupFile = par.db2 + ".lookup"; + std::vector qComplexes, tComplexes; + std::map qComplexIdToIdx, tComplexIdToIdx; chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; - complexIdToChainKeys_t qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; - std::map qcomplexIdToName, tcomplexIdToName; - std::vector qComplexIdVec, tComplexIdVec; - getlookupInfo(qLookupFile, qcomplexIdToName,qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); - getlookupInfo(tLookupFile, tcomplexIdToName, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); - qChainKeyToComplexIdMap.clear(); - Debug::Progress progress(qComplexIdVec.size()); - std::map qComplexLength, tComplexLength; + + getlookupInfo(qLookupFile, qChainKeyToComplexIdMap, qComplexes, qComplexIdToIdx); + getComplexResidueLength(qDbr, qComplexes); + Debug::Progress progress(qComplexes.size()); std::map qComplexIdResult; - for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { - unsigned int tComplexId = tComplexIdVec[tComplexIdx]; - std::vector &tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - if (tChainKeys.empty()) { - continue; - } - unsigned int reslen = getComplexResidueLength(tDbr, tChainKeys); - tComplexLength[tComplexId] =reslen; - } - for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { - unsigned int qComplexId = qComplexIdVec[qComplexIdx]; - std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); - if (qChainKeys.empty()) { - continue; - } - unsigned int reslen = getComplexResidueLength(qDbr, qChainKeys); - qComplexLength[qComplexId] = reslen; + if (sameDB) { + tChainKeyToComplexIdMap = qChainKeyToComplexIdMap; + tComplexes = qComplexes; + tComplexIdToIdx = qComplexIdToIdx; + } else { + getlookupInfo(tLookupFile, tChainKeyToComplexIdMap, tComplexes, tComplexIdToIdx); + getComplexResidueLength(tDbr, tComplexes); } - - #pragma omp parallel num_threads(localThreads) { @@ -412,11 +426,14 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t Matcher::result_t res; #pragma omp for schedule(dynamic, 1) - for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { + for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexes.size(); queryComplexIdx++) { + // DOING progress.updateProgress(); - unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; - std::vector qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); - + + Complex qComplex = qComplexes[queryComplexIdx]; + unsigned int qComplexId = qComplex.complexId; + std::vector qChainKeys = qComplex.chainKeys; + for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainAlnId = alnDbr.getId(qChainKey); @@ -426,7 +443,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(qComplexId) + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); + result5.append(qComplex.complexName + "\t" + tComplexes[queryComplexIdx].complexName + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); break; } @@ -452,7 +469,8 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); unsigned int tComplexId = tChainKeyToComplexIdMap.at(tChainKey); - std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); + unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); + std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; float u[3][3]; float t[3]; @@ -483,9 +501,11 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } for (auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); - std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); - if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qChainKeys.size(), tChainKeys.size()))){ + unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); + Complex tComplex = tComplexes[tComplexIdx]; + std::vector tChainKeys = tComplex.chainKeys; + assId_res.second.calcCov(qComplex.complexLength, tComplex.complexLength); + if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain))){ assIdsToDelete.push_back(assId_res.first); } } @@ -514,10 +534,13 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ unsigned int assId = selectedAssIDs[assIdidx]; unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).dbKey); + unsigned int tComplexIdIdx = tComplexIdToIdx.at(tComplexId); + Complex tComplex = tComplexes[tComplexIdIdx]; + char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(tComplexId) + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); + result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); } #pragma omp critical { @@ -551,14 +574,8 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } qChainKeyToComplexIdMap.clear(); tChainKeyToComplexIdMap.clear(); - qComplexIdToChainKeyMap.clear(); - tComplexIdToChainKeyMap.clear(); - qcomplexIdToName.clear(); - tcomplexIdToName.clear(); - qComplexIdVec.clear(); - tComplexIdVec.clear(); - qComplexLength.clear(); - tComplexLength.clear(); + qComplexes.clear(); + tComplexes.clear(); return EXIT_SUCCESS; } \ No newline at end of file From cdf6e78649cbebf256cf3c0b40a9dee5018e1f8a Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 21 Jul 2024 15:30:30 +0900 Subject: [PATCH 110/160] minor, input->query --- data/easymultimercluster.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/data/easymultimercluster.sh b/data/easymultimercluster.sh index 227519f6..63ca71fd 100644 --- a/data/easymultimercluster.sh +++ b/data/easymultimercluster.sh @@ -80,19 +80,19 @@ postprocessFasta() { ' "${1}" > "${1}.tmp" && mv "${1}.tmp" "${1}" } -if notExists "${TMP_PATH}/input.dbtype"; then +if notExists "${TMP_PATH}/query.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \ - || fail "input createdb died" + "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/query" ${CREATEDB_PAR} \ + || fail "query createdb died" fi if notExists "${TMP_PATH}/complex_clu.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" multimercluster "${TMP_PATH}/input" "${TMP_PATH}/complex_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \ + "$MMSEQS" multimercluster "${TMP_PATH}/query" "${TMP_PATH}/complex_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \ || fail "Multimercluster died" fi -SOURCE="${TMP_PATH}/input" +SOURCE="${TMP_PATH}/query" INPUT="${TMP_PATH}/latest/complex_db" if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 @@ -142,17 +142,17 @@ if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/query" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/query_h" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${INPUT}" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${INPUT}_h" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input_ca" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/query_ca" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/input_ss" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/query_ss" ${VERBOSITY_PAR} rm "${TMP_PATH}/rep_seqs.list" rm -rf "${TMP_PATH}/latest" rm -f "${TMP_PATH}/easymultimercluster.sh" From 3123bae1da6d4884a375fbc080affe89bbc99dbe Mon Sep 17 00:00:00 2001 From: rachelse Date: Mon, 22 Jul 2024 17:06:38 +0900 Subject: [PATCH 111/160] Removed redundant loops and improved performance --- src/strucclustutils/filtercomplex_test.cpp | 180 --------------------- src/strucclustutils/filtermultimer.cpp | 91 ++++++----- 2 files changed, 52 insertions(+), 219 deletions(-) delete mode 100644 src/strucclustutils/filtercomplex_test.cpp diff --git a/src/strucclustutils/filtercomplex_test.cpp b/src/strucclustutils/filtercomplex_test.cpp deleted file mode 100644 index c4224cce..00000000 --- a/src/strucclustutils/filtercomplex_test.cpp +++ /dev/null @@ -1,180 +0,0 @@ -#include "DBWriter.h" -#include "DBReader.h" -#include "IndexReader.h" -#include "createcomplexreport.h" -#include "LocalParameters.h" -#include "Coordinate16.h" - -// #include "Util.h" -// #include "Matcher.h" -// #include "Debug.h" -// #include "FileUtil.h" -// #include "MemoryMapped.h" -// #include "tmalign/basic_fun.h" -// #include "LDDT.h" -// #include "CalcProbTP.h" -// #include - -#ifdef OPENMP -#include -#endif - -struct Complex { - int complexId; - std::string complexName; - - unsigned int nChain; - std::vector chainKeys; - - unsigned int complexLength; - - // Coordinate16 Coords; -} - -static void getlookupInfo( - const std::string &file, - std::vector &complexes, - std::map &chainKeyToComplexIdLookup, - std::map &complexIdtoIdx - // std::map> &complexIdToChainKeysLookup, - // std::map &complexIdtoName, - // std::vector &complexIdVec -) { - if (file.length() == 0) { - return; - } - MemoryMapped lookupDB(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); - char *data = (char *) lookupDB.getData(); - char *end = data + lookupDB.mappedSize(); - const char *entry[255]; - int prevComplexId = -1; - int nComplex = 0; - while (data < end && *data != '\0') { - const size_t columns = Util::getWordsOfLine(data, entry, 255); - if (columns < 3) { - Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n"; - continue; - } - auto chainKey = Util::fast_atoi(entry[0]); - auto complexId = Util::fast_atoi(entry[2]); - chainKeyToComplexIdLookup.emplace(chainKey, complexId); - - if (complexId != prevComplexId) { - std::string chainName(entry[1], (entry[2] - entry[1]) - 1); - size_t lastUnderscoreIndex = chainName.find_last_of('_'); - std::string complexName = chainName.substr(0, lastUnderscoreIndex); - - Complex complex; - complex.complexId = complexId; - complex.nChain = 1; - complex.complexName = complexName; - complex.chainKeys.emplace_back(chainKey); - complexes.emplace_back(complex); - complexIdtoIdx.emplace(complexId, nComplex); - - prevComplexId = complexId; - nComplex++; - } - else { - complexes.back().nChain++; - complexes.back().chainKeys.emplace_back(chainKey); - } - - data = Util::skipLine(data); - } - lookupDB.close(); -} - -static void sumComplexLength (DBReader &structDbr, std::vector &complexes) { - // Fill in the complex length - for (size_t complexIdx = 0; complexIdx < complexes.size(); complexIdx++) { - Complex &cmpl = complexes[complexIdx]; - if (cmpl.chainKeys.size() == 0) { - continue; - } - unsigned int cmplId = cmpl.complexId; - unsigned int cmplLen = 0; - for (size_t chainIdx = 0; chainIdx < cmpl.chainKeys.size(); chainIdx++) { - unsigned int chainKey = cmpl.chainKeys[chainIdx]; - structDbr.get(chainKey); - cmplLen += structDbr.getSequenceLength(); - } - cmpl.complexLength = cmplLen; - } - -} - -int filtercomplex(int argc, const char **argv, const Command &command) { - LocalParameters &par = LocalParameters::getLocalInstance(); - par.parseParameters(argc, argv, command, true, 0, 0); - const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; - const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); - int dbaccessMode = (DBReader::USE_INDEX); - - IndexReader* qDbr = NULL; - qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); - DBReader qStructDbr((par.db1 + "_ca").c_str(), (par.db1 + "_ca.index").c_str(), - par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - qStructDbr.open(DBReader::NOSORT); - - IndexReader* tDbr = NULL; - DBReader *tStructDbr = NULL; - if (sameDB) { - tDbr = qDbr; - tStructDbr = &qStructDbr; - } - else{ - tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); - tStructDbr = new DBReader((par.db2 + "_ca").c_str(), (par.db2 + "_ca.index").c_str(), - par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - tStructDbr->open(DBReader::NOSORT); - } - - DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX| DBReader::USE_DATA); - alnDbr.open(DBReader::LINEAR_ACCCESS); - size_t localThreads = 1; - -#ifdef OPENMP -localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); -#endif - - const bool shouldCompress = (par.compressed == true); - const int db4Type = Parameters::DBTYPE_CLUSTER_RES; - - DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, db4Type); - resultWriter.open(); - - //TODO: remove resultWrite5 when done - const int db5Type = Parameters::DBTYPE_GENERIC_DB; - DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), 1, shouldCompress, db5Type); - resultWrite5.open(); - - std::string qLookupFile = par.db1 + ".lookup"; - std::string tLookupFile = par.db2 + ".lookup"; - - chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; - // complexIdToChainKeys_t qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; - // std::map qcomplexIdToName, tcomplexIdToName; - // std::vector qComplexIdVec, tComplexIdVec; - std::vector qComplexes, tComplexes; - std::map qComplexIdtoIdx, tComplexIdtoIdx; - // getlookupInfo(qLookupFile, qcomplexIdToName,qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); - // std::map qComplexLength, tComplexLength; - // std::map qComplexIdResult; - getlookupInfo(qLookupFile, qComplexes, qChainKeyToComplexIdMap, qComplexIdtoIdx); - - // Fill in the complex length - sumComplexLength(qStructDbr, qComplexes); - - if (sameDB) { - tComplexes = qComplexes; - tChainKeyToComplexIdMap = qChainKeyToComplexIdMap; - tComplexIdtoIdx = qComplexIdtoIdx; - } - else { - getlookupInfo(tLookupFile, tComplexes, tChainKeyToComplexIdMap, tComplexIdtoIdx); - sumComplexLength(tStructDbr, tComplexes); - } - - return EXIT_SUCCESS; -} diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index ba9d84ef..c97f0106 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -25,9 +25,20 @@ struct Complex { unsigned int complexLength; - // Coordinate16 Coords; + // float* data; Complex() : complexId(0), complexName(""), nChain(0), complexLength(0) {} + ~Complex() { + chainKeys.clear(); + // if (data != NULL) { + // free(data); + // } + } + + // void readCoords(char *cadata, size_t caLength) { + // Coordinate16 coords; + // data = coords.read(cadata, complexLength, caLength); + // } }; unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { @@ -162,9 +173,9 @@ class ComplexFilterCriteria { return (covOK && TMOK && chainTMOK); } - void update(unsigned int qChainKey, unsigned int tChainKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qChainTm, double tChainTm) { - this->tTotalAlnLen += tTotalAlnLen; - this->qTotalAlnLen += qTotalAlnLen; + void update(unsigned int qChainKey, unsigned int tChainKey, unsigned int qAlnLen, unsigned int tAlnLen, double qChainTm, double tChainTm) { + this->tTotalAlnLen += tAlnLen; + this->qTotalAlnLen += qAlnLen; this->alignedQChainTmScores.push_back(qChainTm); this->alignedTChainTmScores.push_back(tChainTm); auto pos = std::find(qChainKeys.begin(), qChainKeys.end(), qChainKey); @@ -416,21 +427,19 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t thread_idx = static_cast(omp_get_thread_num()); #endif std::string result; - std::map tmpDBKEYut; std::map localComplexMap; - std::vector assIdsToDelete; std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] - std::vector selectedAssIDs; + std::vector filteredAssIds; Coordinate16 qcoords; Coordinate16 tcoords; Matcher::result_t res; #pragma omp for schedule(dynamic, 1) - for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexes.size(); queryComplexIdx++) { + for (size_t qComplexIdx = 0; qComplexIdx < qComplexes.size(); qComplexIdx++) { // DOING progress.updateProgress(); - Complex qComplex = qComplexes[queryComplexIdx]; + Complex qComplex = qComplexes[qComplexIdx]; unsigned int qComplexId = qComplex.complexId; std::vector qChainKeys = qComplex.chainKeys; @@ -438,21 +447,24 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int qChainKey = qChainKeys[qChainIdx]; unsigned int qChainAlnId = alnDbr.getId(qChainKey); unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); - //handling monomer as singleton + // Handling monomer as singleton if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - result5.append(qComplex.complexName + "\t" + tComplexes[queryComplexIdx].complexName + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); + result5.append(qComplex.complexName + "\t" + tComplexes[qComplexIdx].complexName + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); break; } + // Retrieve coordinates + char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); + size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); + float* qdata = qcoords.read(qcadata, qComplex.complexLength, qCaLength); + // qComplex.readCoords(qcadata, qCaLength); + char *data = alnDbr.getData(qChainAlnId, thread_idx); while (*data != '\0' ) { ComplexDataHandler retComplex = parseScoreComplexResult(data, res); - char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); - size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); - float* qdata = qcoords.read(qcadata, res.qLen, qCaLength); if (!retComplex.isValid){ Debug(Debug::ERROR) << "No scorecomplex result provided"; @@ -460,67 +472,69 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } data = Util::skipLine(data); + unsigned int assId = retComplex.assId; - unsigned int tChainKey= res.dbKey; + unsigned int tChainKey = res.dbKey; unsigned int tChainAlnId = alnDbr.getId(tChainKey); - //if target is monomer, break to be singleton + // If target is monomer, break to be singleton if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ break; } unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); unsigned int tComplexId = tChainKeyToComplexIdMap.at(tChainKey); unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); - std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; + Complex tComplex = tComplexes[tComplexIdx]; float u[3][3]; float t[3]; fillUArr(retComplex.uString, u); fillTArr(retComplex.tString, t); - tmpDBKEYut[assId]=retComplex.uString+","+retComplex.tString; + char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); + unsigned int alnLen = cigarToAlignedLength(res.backtrace); Coordinates qm(alnLen), tm(alnLen); //FIXME: if new chainTM not required, erase those part fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); double qChainTm = chainTm / res.qLen; - double tChainTm = chainTm/ res.dbLen; - unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + double tChainTm = chainTm / res.dbLen; + unsigned int qalnlen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + unsigned int talnlen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + + std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; if (localComplexMap.find(assId) == localComplexMap.end()) { ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, qChainKeys, tChainKeys, t, u); localComplexMap[assId] = cmplfiltcrit; - localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm); + localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); } else { - localComplexMap.at(assId).update(qChainKey, tChainKey, qtotalaln, ttotalaln, qChainTm, tChainTm); + localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); } } // while end } + + // Filter the target complexes for (auto& assId_res : localComplexMap){ unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdx]; - std::vector tChainKeys = tComplex.chainKeys; + assId_res.second.calcCov(qComplex.complexLength, tComplex.complexLength); - if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain))){ - assIdsToDelete.push_back(assId_res.first); + // Check criteria and pass if the criteria is not satisfied + bool satisfied = assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain); + if (!satisfied){ + continue; } - } - for (const auto& key : assIdsToDelete) { - localComplexMap.erase(key); - } - - for (const auto& assId_res : localComplexMap){ - unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); unsigned int alnlen = adjustAlnLen(assId_res.second.qTotalAlnLen, assId_res.second.tTotalAlnLen, par.covMode); if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } else { + // Save the best alignment based on the adjusted alignment length if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } @@ -528,11 +542,11 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } for (const auto& pair : cmplIdToBestAssId){ - selectedAssIDs.push_back(pair.second[0]); + filteredAssIds.push_back(pair.second[0]); } - for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ - unsigned int assId = selectedAssIDs[assIdidx]; + for (unsigned int assIdidx = 0; assIdidx < filteredAssIds.size(); assIdidx++){ + unsigned int assId = filteredAssIds[assIdidx]; unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).dbKey); unsigned int tComplexIdIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdIdx]; @@ -542,16 +556,15 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t result.push_back('\n'); result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); } + #pragma omp critical { qComplexIdResult[qComplexId]= result; } result.clear(); localComplexMap.clear(); - tmpDBKEYut.clear(); - assIdsToDelete.clear(); cmplIdToBestAssId.clear(); - selectedAssIDs.clear(); + filteredAssIds.clear(); } // for end #pragma omp critical { From 6a94924ee8a9f457e21aa265039f4f88c7ae9322 Mon Sep 17 00:00:00 2001 From: rachelse Date: Mon, 22 Jul 2024 19:47:35 +0900 Subject: [PATCH 112/160] Corrected mistake: Saved dbKey as target complex id so far.. --- src/strucclustutils/filtermultimer.cpp | 151 ++++++++++++------------- 1 file changed, 75 insertions(+), 76 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index c97f0106..5f8f9d2d 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -25,22 +25,15 @@ struct Complex { unsigned int complexLength; - // float* data; + // Coordinate16 Coords; Complex() : complexId(0), complexName(""), nChain(0), complexLength(0) {} ~Complex() { chainKeys.clear(); - // if (data != NULL) { - // free(data); - // } } - - // void readCoords(char *cadata, size_t caLength) { - // Coordinate16 coords; - // data = coords.read(cadata, complexLength, caLength); - // } }; + unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: @@ -56,11 +49,11 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { class ComplexFilterCriteria { public: - unsigned int dbKey; + unsigned int targetComplexId; double qTM; double tTM; - std::vector qChainKeys; - std::vector tChainKeys; + // std::vector qChainKeys; + // std::vector tChainKeys; float t[3]; float u[3][3]; unsigned int qTotalAlnLen; @@ -71,8 +64,10 @@ class ComplexFilterCriteria { std::vector alignedTChainTmScores; ComplexFilterCriteria() {} - ComplexFilterCriteria(unsigned int dbKey, double qTM, double tTM, std::vector &qChainKeys, std::vector &tChainKeys, float tstring[3], float ustring[3][3]) : - dbKey(dbKey), qTM(qTM), tTM(tTM), qChainKeys(qChainKeys), tChainKeys(tChainKeys), qTotalAlnLen(0), tTotalAlnLen(0) { + // ComplexFilterCriteria(unsigned int dbKey, double qTM, double tTM, std::vector &qChainKeys, std::vector &tChainKeys, float tstring[3], float ustring[3][3]) : + // dbKey(dbKey), qTM(qTM), tTM(tTM), qChainKeys(qChainKeys), tChainKeys(tChainKeys), qTotalAlnLen(0), tTotalAlnLen(0) { + ComplexFilterCriteria(unsigned int targetComplexId, double qTM, double tTM, float tstring[3], float ustring[3][3]) : + targetComplexId(targetComplexId), qTM(qTM), tTM(tTM), qTotalAlnLen(0), tTotalAlnLen(0) { std::copy(tstring, tstring + 3, t); for (int i = 0; i < 3; i++) { std::copy(ustring[i], ustring[i] + 3, u[i]); @@ -125,12 +120,12 @@ class ComplexFilterCriteria { // } // } - bool hasChainTm(float chainTMThr, int covMode, int filterMode, unsigned int qChainNum, unsigned int tChainNum) { + bool hasChainTm(float chainTMThr, int covMode, unsigned int qChainNum, unsigned int tChainNum) { switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: - if (alignedQChainTmScores.size()tTotalAlnLen += tAlnLen; this->qTotalAlnLen += qAlnLen; this->alignedQChainTmScores.push_back(qChainTm); this->alignedTChainTmScores.push_back(tChainTm); - auto pos = std::find(qChainKeys.begin(), qChainKeys.end(), qChainKey); - if (pos != qChainKeys.end()) { - qChainKeys.erase(pos); - } - pos = std::find(tChainKeys.begin(), tChainKeys.end(), tChainKey); - if (pos != tChainKeys.end()) { - tChainKeys.erase(pos); - } + // FIXME : What is this? + // auto pos = std::find(qChainKeys.begin(), qChainKeys.end(), qChainKey); + // if (pos != qChainKeys.end()) { + // qChainKeys.erase(pos); + // } + // pos = std::find(tChainKeys.begin(), tChainKeys.end(), tChainKey); + // if (pos != tChainKeys.end()) { + // tChainKeys.erase(pos); + // } } void calcCov(unsigned int qLen, unsigned int tLen) { @@ -429,7 +428,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t std::string result; std::map localComplexMap; std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] - std::vector filteredAssIds; + std::vector selectedAssIDs; Coordinate16 qcoords; Coordinate16 tcoords; @@ -455,13 +454,12 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t result5.append(qComplex.complexName + "\t" + tComplexes[qComplexIdx].complexName + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); break; } - - // Retrieve coordinates + char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); - float* qdata = qcoords.read(qcadata, qComplex.complexLength, qCaLength); - // qComplex.readCoords(qcadata, qCaLength); - + size_t qChainLen = qDbr->sequenceReader->getSeqLen(qChainDbId); + float* qdata = qcoords.read(qcadata, qChainLen, qCaLength); + char *data = alnDbr.getData(qChainAlnId, thread_idx); while (*data != '\0' ) { ComplexDataHandler retComplex = parseScoreComplexResult(data, res); @@ -472,69 +470,71 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } data = Util::skipLine(data); - unsigned int assId = retComplex.assId; unsigned int tChainKey = res.dbKey; unsigned int tChainAlnId = alnDbr.getId(tChainKey); - // If target is monomer, break to be singleton + //if target is monomer, break to be singleton if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ break; } unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); unsigned int tComplexId = tChainKeyToComplexIdMap.at(tChainKey); unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); - Complex tComplex = tComplexes[tComplexIdx]; + std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; float u[3][3]; float t[3]; - fillUArr(retComplex.uString, u); - fillTArr(retComplex.tString, t); - - char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); - size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); - float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); - - unsigned int alnLen = cigarToAlignedLength(res.backtrace); - Coordinates qm(alnLen), tm(alnLen); - //FIXME: if new chainTM not required, erase those part - fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); - double qChainTm = chainTm / res.qLen; - double tChainTm = chainTm / res.dbLen; + double qChainTm = 0.0; + double tChainTm = 0.0; + if (par.filtChainTmThr > 0.0f) { + fillUArr(retComplex.uString, u); + fillTArr(retComplex.tString, t); + char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); + size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); + float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); + unsigned int alnLen = cigarToAlignedLength(res.backtrace); + Coordinates qm(alnLen), tm(alnLen); + fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); + qChainTm = chainTm / res.qLen; + tChainTm = chainTm / res.dbLen; + } + unsigned int qalnlen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int talnlen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - - std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, qChainKeys, tChainKeys, t, u); + // ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, qChainKeys, tChainKeys, t, u); + // ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, t, u); + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); localComplexMap[assId] = cmplfiltcrit; - localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); + // localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); + localComplexMap.at(assId).update(qalnlen, talnlen, qChainTm, tChainTm); } else { - localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); + // localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); + localComplexMap.at(assId).update(qalnlen, talnlen, qChainTm, tChainTm); } } // while end } - // Filter the target complexes + // Filter the target complexes and get the best alignment for (auto& assId_res : localComplexMap){ - unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); + unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.targetComplexId); unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdx]; assId_res.second.calcCov(qComplex.complexLength, tComplex.complexLength); - // Check criteria and pass if the criteria is not satisfied - bool satisfied = assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain); - if (!satisfied){ + // Check if the criteria is satisfied + if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain))){ continue; } unsigned int alnlen = adjustAlnLen(assId_res.second.qTotalAlnLen, assId_res.second.tTotalAlnLen, par.covMode); + // Get the best alignement per each target complex if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } else { - // Save the best alignment based on the adjusted alignment length if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } @@ -542,12 +542,12 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } for (const auto& pair : cmplIdToBestAssId){ - filteredAssIds.push_back(pair.second[0]); + selectedAssIDs.push_back(pair.second[0]); } - for (unsigned int assIdidx = 0; assIdidx < filteredAssIds.size(); assIdidx++){ - unsigned int assId = filteredAssIds[assIdidx]; - unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).dbKey); + for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ + unsigned int assId = selectedAssIDs[assIdidx]; + unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).targetComplexId); unsigned int tComplexIdIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdIdx]; @@ -556,7 +556,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t result.push_back('\n'); result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); } - #pragma omp critical { qComplexIdResult[qComplexId]= result; @@ -564,7 +563,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t result.clear(); localComplexMap.clear(); cmplIdToBestAssId.clear(); - filteredAssIds.clear(); + selectedAssIDs.clear(); } // for end #pragma omp critical { From 3fd0dab73228c0a9210ced88bf7fa807a483401f Mon Sep 17 00:00:00 2001 From: rachelse Date: Mon, 22 Jul 2024 20:18:29 +0900 Subject: [PATCH 113/160] Corrected targetcomplexid mistake & chain number comparison --- src/strucclustutils/filtermultimer.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 5f8f9d2d..461f033b 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -49,6 +49,7 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { class ComplexFilterCriteria { public: + // unsigned int dbKey; //FIXME: dbkey is key of chain? unsigned int targetComplexId; double qTM; double tTM; @@ -505,7 +506,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t if (localComplexMap.find(assId) == localComplexMap.end()) { // ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, qChainKeys, tChainKeys, t, u); // ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, t, u); - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); // FIXME: Critical mistake localComplexMap[assId] = cmplfiltcrit; // localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); localComplexMap.at(assId).update(qalnlen, talnlen, qChainTm, tChainTm); @@ -548,8 +549,8 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ unsigned int assId = selectedAssIDs[assIdidx]; unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).targetComplexId); - unsigned int tComplexIdIdx = tComplexIdToIdx.at(tComplexId); - Complex tComplex = tComplexes[tComplexIdIdx]; + unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); + Complex tComplex = tComplexes[tComplexIdx]; char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); From a8f0a091d12c8f2fd8f67b5b483ff6886ca2065b Mon Sep 17 00:00:00 2001 From: rachelse Date: Mon, 22 Jul 2024 21:21:56 +0900 Subject: [PATCH 114/160] The mistake was not a big problem. One stage before putting iLDDT code --- src/strucclustutils/filtermultimer.cpp | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 461f033b..b5e8f295 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -122,11 +122,11 @@ class ComplexFilterCriteria { // } bool hasChainTm(float chainTMThr, int covMode, unsigned int qChainNum, unsigned int tChainNum) { + if (alignedQChainTmScores.size() Date: Tue, 23 Jul 2024 20:06:35 +0900 Subject: [PATCH 115/160] createtsv with --threads 1 to make complex_db_h in order --- data/multimercluster.sh | 4 ++-- src/workflow/MultimerCluster.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/multimercluster.sh b/data/multimercluster.sh index 56ef5bb0..21d3f7b9 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -100,7 +100,8 @@ if notExists "${TMP_PATH}/complex_db_h.dbtype"; then # "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} \ # || fail "tsv2db died" # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" ${VERBOSITY_PAR} \ + # "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" ${VERBOSITY_PAR} \ + "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" --threads 1 \ || fail "createtsv died" buldCmplhDb "${TMP_PATH}/chain_db_h.tsv" "${TMP_PATH}/complex_header.tsv" # shellcheck disable=SC2086 @@ -122,7 +123,6 @@ if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} rm "${TMP_PATH}/complex_header.tsv" - rm "${TMP_PATH}/complex_header.tsv_redundant" rm -rf "${TMP_PATH}/multimersearch_tmp" rm -f "${TMP_PATH}/multimercluster.sh" fi \ No newline at end of file diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp index c576fb8b..32c1e34a 100644 --- a/src/workflow/MultimerCluster.cpp +++ b/src/workflow/MultimerCluster.cpp @@ -61,8 +61,8 @@ int multimercluster(int argc, const char **argv, const Command &command) { cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str()); - cmd.addVariable("VERBCOMPRESS", par.createParameterString(par.verbandcompression).c_str()); - + // cmd.addVariable("VERBCOMPRESS", par.createParameterString(par.verbandcompression).c_str()); + std::string program = tmpDir + "/multimercluster.sh"; FileUtil::writeFile(program, multimercluster_sh, multimercluster_sh_len); cmd.execProgram(program.c_str(), par.filenames); From e35f355fa0a36ce4eb522fe7c339f9edf9d57b0c Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 24 Jul 2024 18:50:44 +0900 Subject: [PATCH 116/160] setting default parameter collides with existing default values --- src/workflow/MultimerCluster.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp index 32c1e34a..a33ec6a5 100644 --- a/src/workflow/MultimerCluster.cpp +++ b/src/workflow/MultimerCluster.cpp @@ -9,19 +9,11 @@ #include "multimercluster.sh.h" void setMultimerClusterDefaults(LocalParameters *p) { - p->covThr = 0.8; p->filtMultimerTmThr = 0.5; // FIX - // p->filtChainTmThr=0.0; // FIX + p->filtChainTmThr=0.0; // FIX p->filterMode=0; - p->covMode = 1; - p->clusteringMode = Parameters::GREEDY; - p->removeTmpFiles = true; } -void setMultimerClusterMustPassAlong(Parameters *p) { - p->PARAM_C.wasSet = true; - p->PARAM_REMOVE_TMP_FILES.wasSet = true; -} int multimercluster(int argc, const char **argv, const Command &command) { LocalParameters &par = LocalParameters::getLocalInstance(); par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT); //align @@ -38,7 +30,6 @@ int multimercluster(int argc, const char **argv, const Command &command) { setMultimerClusterDefaults(&par); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - setMultimerClusterMustPassAlong(&par); std::string tmpDir = par.filenames.back(); std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, *command.params)); From 27756597036838043fa00d58ace316e1583d1731 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 25 Jul 2024 14:56:36 +0900 Subject: [PATCH 117/160] changed order of elements in struct and class for memory --- src/strucclustutils/filtermultimer.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index b5e8f295..338312b0 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -19,15 +19,14 @@ struct Complex { int complexId; - std::string complexName; unsigned int nChain; - std::vector chainKeys; - unsigned int complexLength; + std::string complexName; + std::vector chainKeys; // Coordinate16 Coords; - Complex() : complexId(0), complexName(""), nChain(0), complexLength(0) {} + Complex() : complexId(0), nChain(0), complexLength(0), complexName("") {} ~Complex() { chainKeys.clear(); } @@ -51,16 +50,16 @@ class ComplexFilterCriteria { public: // unsigned int dbKey; //FIXME: dbkey is key of chain? unsigned int targetComplexId; - double qTM; - double tTM; - // std::vector qChainKeys; - // std::vector tChainKeys; - float t[3]; - float u[3][3]; unsigned int qTotalAlnLen; unsigned int tTotalAlnLen; + float t[3]; + float u[3][3]; float qCov; float tCov; + double qTM; + double tTM; + // std::vector qChainKeys; + // std::vector tChainKeys; std::vector alignedQChainTmScores; std::vector alignedTChainTmScores; @@ -68,7 +67,7 @@ class ComplexFilterCriteria { // ComplexFilterCriteria(unsigned int dbKey, double qTM, double tTM, std::vector &qChainKeys, std::vector &tChainKeys, float tstring[3], float ustring[3][3]) : // dbKey(dbKey), qTM(qTM), tTM(tTM), qChainKeys(qChainKeys), tChainKeys(tChainKeys), qTotalAlnLen(0), tTotalAlnLen(0) { ComplexFilterCriteria(unsigned int targetComplexId, double qTM, double tTM, float tstring[3], float ustring[3][3]) : - targetComplexId(targetComplexId), qTM(qTM), tTM(tTM), qTotalAlnLen(0), tTotalAlnLen(0) { + targetComplexId(targetComplexId), qTotalAlnLen(0), tTotalAlnLen(0), qTM(qTM), tTM(tTM) { std::copy(tstring, tstring + 3, t); for (int i = 0; i < 3; i++) { std::copy(ustring[i], ustring[i] + 3, u[i]); From 3d26d2ee59e54b070c188ef3948be29e24ad3583 Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 25 Jul 2024 15:01:37 +0900 Subject: [PATCH 118/160] commit before pull --- src/commons/LocalParameters.cpp | 3 + src/commons/LocalParameters.h | 2 + src/strucclustutils/filtermultimer.cpp | 160 ++++++++++++++----------- src/workflow/MultimerCluster.cpp | 1 + 4 files changed, 98 insertions(+), 68 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 5b19d89c..72f0ceee 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -36,6 +36,7 @@ LocalParameters::LocalParameters() : PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON), PARAM_MULTIMER_TM_THRESHOLD(PARAM_MULTIMER_TM_THRESHOLD_ID,"--multimer-tm-threshold", "TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtMultimerTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "chain TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), + PARAM_INTERFACE_LDDT_THRESHOLD(PARAM_INTERFACE_LDDT_THRESHOLD_ID,"--interface-lddt-threshold", "Interface LDDT threshold", "accept alignments with a lddt > thr [0.0,1.0]",typeid(float), (void *) &filtInterfaceLddtThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_FILTER_MODE(PARAM_FILTER_MODE_ID, "--filter-mode", "Filter mode", "0: Interface\n1: Conformation\n2: loose", typeid(int), (void *) &filterMode, "[0-2]{0}$", MMseqsParameter::COMMAND_CLUST) { @@ -196,6 +197,7 @@ LocalParameters::LocalParameters() : filtermultimer.push_back(&PARAM_COV_MODE); filtermultimer.push_back(&PARAM_MULTIMER_TM_THRESHOLD); filtermultimer.push_back(&PARAM_CHAIN_TM_THRESHOLD); + filtermultimer.push_back(&PARAM_INTERFACE_LDDT_THRESHOLD); filtermultimer.push_back(&PARAM_FILTER_MODE); // createmultimerreport @@ -270,6 +272,7 @@ LocalParameters::LocalParameters() : citations.emplace(CITATION_PROSTT5, "Heinzinger, M., Weissenow, K., Gomez Sanchez, J., Henkel, A., Mirdita, M., Steinegger, M., Steinegger, M., and Burkhard, R. Bilingual Language Model for Protein Sequence and Structure. bioRxiv, doi:10.1101/2023.07.23.550085 (2024)"); filtMultimerTmThr = 0.0; filtChainTmThr = 0.0; + filtInterfaceLddtThr = 0.0; filterMode = 0; prostt5Model = ""; gpu = 0; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index cbee8feb..43b37e1b 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -138,6 +138,7 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_GPU) PARAMETER(PARAM_MULTIMER_TM_THRESHOLD) PARAMETER(PARAM_CHAIN_TM_THRESHOLD) + PARAMETER(PARAM_INTERFACE_LDDT_THRESHOLD) PARAMETER(PARAM_FILTER_MODE) @@ -167,6 +168,7 @@ class LocalParameters : public Parameters { int pdbOutputMode; float filtMultimerTmThr; float filtChainTmThr; + float filtInterfaceLddtThr; int filterMode; std::string prostt5Model; int gpu; diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 338312b0..f0ee7161 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -32,6 +32,14 @@ struct Complex { } }; +// struct AlignedComplex { +// unsigned int assId; +// std::vector qAlnChains; +// std::vector tAlnChains; +// std::vector qAlnCoords; +// std::vector tAlnCoords; +// }; + unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { switch (covMode) { @@ -48,24 +56,28 @@ unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { class ComplexFilterCriteria { public: - // unsigned int dbKey; //FIXME: dbkey is key of chain? unsigned int targetComplexId; + + // per complex + double qTM; + double tTM; unsigned int qTotalAlnLen; unsigned int tTotalAlnLen; - float t[3]; - float u[3][3]; float qCov; float tCov; - double qTM; - double tTM; - // std::vector qChainKeys; - // std::vector tChainKeys; - std::vector alignedQChainTmScores; - std::vector alignedTChainTmScores; + float t[3]; + float u[3][3]; + + // per chain : criteria for chainTmThr & lddtThr + std::vector qAlnChainKeys; + std::vector tAlnChainKeys; + std::vector qAlnCoords; + std::vector tAlnCoords; + + std::vector qAlnChainTMs; + std::vector tAlnChainTMs; ComplexFilterCriteria() {} - // ComplexFilterCriteria(unsigned int dbKey, double qTM, double tTM, std::vector &qChainKeys, std::vector &tChainKeys, float tstring[3], float ustring[3][3]) : - // dbKey(dbKey), qTM(qTM), tTM(tTM), qChainKeys(qChainKeys), tChainKeys(tChainKeys), qTotalAlnLen(0), tTotalAlnLen(0) { ComplexFilterCriteria(unsigned int targetComplexId, double qTM, double tTM, float tstring[3], float ustring[3][3]) : targetComplexId(targetComplexId), qTotalAlnLen(0), tTotalAlnLen(0), qTM(qTM), tTM(tTM) { std::copy(tstring, tstring + 3, t); @@ -74,8 +86,12 @@ class ComplexFilterCriteria { } } ~ComplexFilterCriteria() { - alignedQChainTmScores.clear(); - alignedTChainTmScores.clear(); + qAlnChainTMs.clear(); + tAlnChainTMs.clear(); + qAlnChainKeys.clear(); + tAlnChainKeys.clear(); + qAlnCoords.clear(); + tAlnCoords.clear(); } bool hasTM(float TMThr, int covMode, int filterMode){ @@ -96,11 +112,11 @@ class ComplexFilterCriteria { // case LocalParameters::FILTER_MODE_INTERFACE: // switch (covMode) { // case Parameters::COV_MODE_BIDIRECTIONAL: - // return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); + // return (qAlnChainTMs.size()==qChainNum && qChainNum==tChainNum); // case Parameters::COV_MODE_TARGET: - // return (alignedTChainTmScores.size()==tChainNum); + // return (tAlnChainTMs.size()==tChainNum); // case Parameters::COV_MODE_QUERY: - // return (alignedQChainTmScores.size()==qChainNum); + // return (qAlnChainTMs.size()==qChainNum); // default: // return true; // } @@ -121,27 +137,27 @@ class ComplexFilterCriteria { // } bool hasChainTm(float chainTMThr, int covMode, unsigned int qChainNum, unsigned int tChainNum) { - if (alignedQChainTmScores.size()tTotalAlnLen += tAlnLen; - this->qTotalAlnLen += qAlnLen; - this->alignedQChainTmScores.push_back(qChainTm); - this->alignedTChainTmScores.push_back(tChainTm); - // FIXME : What is this? - // auto pos = std::find(qChainKeys.begin(), qChainKeys.end(), qChainKey); - // if (pos != qChainKeys.end()) { - // qChainKeys.erase(pos); - // } - // pos = std::find(tChainKeys.begin(), tChainKeys.end(), tChainKey); - // if (pos != tChainKeys.end()) { - // tChainKeys.erase(pos); - // } + void updateAln(unsigned int qAlnLen, unsigned int tAlnLen) { + qTotalAlnLen += qAlnLen; + tTotalAlnLen += tAlnLen; + } + + void updateChainTm(double qChainTm, double tChainTm) { + qAlnChainTMs.push_back(qChainTm); + tAlnChainTMs.push_back(tChainTm); + } + + void saveAlnCoords(unsigned int qChainKey, unsigned int tChainKey, Coordinates &qAlnCoords, Coordinates &tAlnCoords) { + this->qAlnChainKeys.push_back(qChainKey); + this->tAlnChainKeys.push_back(tChainKey); + // this->qAlnCoords.push_back(qAlnCoords); + // this->tAlnCoords.push_back(tAlnCoords); } + // void update(unsigned int qChainKey, unsigned int tChainKey, double qChainTm, double tChainTm) { + // this->qAlnChainTMs.push_back(qChainTm); + // this->tAlnChainTMs.push_back(tChainTm); + + // this->qAlnChainKeys.push_back(qChainKey); + // this->tAlnChainKeys.push_back(tChainKey); + // // FIXME : What is this? + // } + void calcCov(unsigned int qLen, unsigned int tLen) { qCov = static_cast(qTotalAlnLen) / static_cast(qLen); tCov = static_cast(tTotalAlnLen) / static_cast(tLen); @@ -282,7 +306,6 @@ double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u void getComplexResidueLength( IndexReader *Dbr, std::vector &complexes) { for (size_t complexIdx = 0; complexIdx < complexes.size(); complexIdx++) { Complex *complex = &complexes[complexIdx]; - unsigned int complexId = complex->complexId; std::vector &chainKeys = complex->chainKeys; if (chainKeys.empty()) { continue; @@ -400,7 +423,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t getlookupInfo(qLookupFile, qChainKeyToComplexIdMap, qComplexes, qComplexIdToIdx); getComplexResidueLength(qDbr, qComplexes); Debug::Progress progress(qComplexes.size()); - std::map qComplexIdResult; + std::map qComplexIdResult; if (sameDB) { tChainKeyToComplexIdMap = qChainKeyToComplexIdMap; @@ -419,7 +442,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif - std::string result; + resultToWrite_t result; std::map localComplexMap; std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] std::vector selectedAssIDs; @@ -477,47 +500,46 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t float u[3][3]; float t[3]; - double qChainTm = 0.0; - double tChainTm = 0.0; - if (par.filtChainTmThr > 0.0f) { - fillUArr(retComplex.uString, u); - fillTArr(retComplex.tString, t); + fillUArr(retComplex.uString, u); + fillTArr(retComplex.tString, t); + unsigned int qalnlen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); + unsigned int talnlen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); + if (localComplexMap.find(assId) == localComplexMap.end()) { + ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); + localComplexMap[assId] = cmplfiltcrit; + } + localComplexMap.at(assId).updateAln(qalnlen, talnlen); + + // do extra calculations if needed : chainTm, iLDDT + if (par.filtChainTmThr > 0.0f || par.filtInterfaceLddtThr > 0.0f) { char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); unsigned int alnLen = cigarToAlignedLength(res.backtrace); Coordinates qm(alnLen), tm(alnLen); fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); - qChainTm = chainTm / res.qLen; - tChainTm = chainTm / res.dbLen; - } - unsigned int qalnlen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - unsigned int talnlen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - if (localComplexMap.find(assId) == localComplexMap.end()) { - // ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, qChainKeys, tChainKeys, t, u); - // ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, retComplex.qTmScore, retComplex.tTmScore, t, u); - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); // FIXME: Critical mistake - localComplexMap[assId] = cmplfiltcrit; - // localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); - localComplexMap.at(assId).update(qalnlen, talnlen, qChainTm, tChainTm); - } else { - // localComplexMap.at(assId).update(qChainKey, tChainKey, qalnlen, talnlen, qChainTm, tChainTm); - localComplexMap.at(assId).update(qalnlen, talnlen, qChainTm, tChainTm); + if (par.filtChainTmThr > 0.0f) { + double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); + localComplexMap.at(assId).updateChainTm(chainTm / res.qLen, chainTm / res.dbLen); + } + + // DOING + if (par.filtInterfaceLddtThr > 0.0f) { + localComplexMap.at(assId).saveAlnCoords(qChainKey, tChainKey, qm, tm); + } } - } // while end } // Filter the target complexes and get the best alignment for (auto& assId_res : localComplexMap){ - // unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.targetComplexId); unsigned int tComplexId = assId_res.second.targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); - Complex tComplex = tComplexes[tComplexIdx]; + Complex tComplex = tComplexes[tComplexIdx]; assId_res.second.calcCov(qComplex.complexLength, tComplex.complexLength); + // TODO: Do something for interface LDDT // Check if the criteria are met if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain))){ continue; @@ -541,7 +563,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ unsigned int assId = selectedAssIDs[assIdidx]; - // unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).targetComplexId); unsigned int tComplexId = localComplexMap.at(assId).targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdx]; @@ -551,6 +572,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t result.push_back('\n'); result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); } + #pragma omp critical { qComplexIdResult[qComplexId]= result; @@ -563,9 +585,10 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t #pragma omp critical { resultWrite5.writeData(result5.c_str(), result5.length(), 0); + result5.clear(); } - result5.clear(); } // MP end + for (auto &pair : qComplexIdResult){ resultWriter.writeData(pair.second.c_str(), pair.second.length(), pair.first); } @@ -583,6 +606,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t tChainKeyToComplexIdMap.clear(); qComplexes.clear(); tComplexes.clear(); + qComplexIdResult.clear(); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp index c576fb8b..4d4010c7 100644 --- a/src/workflow/MultimerCluster.cpp +++ b/src/workflow/MultimerCluster.cpp @@ -12,6 +12,7 @@ void setMultimerClusterDefaults(LocalParameters *p) { p->covThr = 0.8; p->filtMultimerTmThr = 0.5; // FIX // p->filtChainTmThr=0.0; // FIX + // p->filtInterfaceLddtThr = 0.0; // FIX p->filterMode=0; p->covMode = 1; p->clusteringMode = Parameters::GREEDY; From 50208e9bd1668ff55851193b929d7e5bae024f2f Mon Sep 17 00:00:00 2001 From: rachelse Date: Thu, 25 Jul 2024 15:07:51 +0900 Subject: [PATCH 119/160] Merged commit with review --- src/strucclustutils/filtermultimer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index f0ee7161..95d8a6ef 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -59,12 +59,12 @@ class ComplexFilterCriteria { unsigned int targetComplexId; // per complex - double qTM; - double tTM; unsigned int qTotalAlnLen; unsigned int tTotalAlnLen; float qCov; float tCov; + double qTM; + double tTM; float t[3]; float u[3][3]; From c86d2ce3375258b80dd7906b3f016b59ad7e0aa4 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 25 Jul 2024 18:27:09 +0900 Subject: [PATCH 120/160] solved complex_db_h for monomers --- data/multimercluster.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/multimercluster.sh b/data/multimercluster.sh index 21d3f7b9..ba7b5732 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -53,12 +53,12 @@ buldCmplhDb(){ { split($2,words," ") split(words[1],parts,"_") - output_string="" - for (j = 1; j < length(parts); j++) { - output_string = output_string parts[j] + output_string=parts[1] + for (j = 2; j < length(parts); j++) { if (j < length(parts)-1){ output_string=output_string"_" } + output_string = output_string parts[j] } headerstring="" for (k = 2; k < length(words)+1; k++) { From e478a324efaaf3c294c898a280761fc478e241ae Mon Sep 17 00:00:00 2001 From: rachelse Date: Mon, 29 Jul 2024 17:55:16 +0900 Subject: [PATCH 121/160] Saved aligned coordinates into vector but cannot use SIMD operations --- src/strucclustutils/filtermultimer.cpp | 142 +++++++++++++------------ src/workflow/MultimerCluster.cpp | 4 - 2 files changed, 74 insertions(+), 72 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 95d8a6ef..63655250 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -32,14 +32,11 @@ struct Complex { } }; -// struct AlignedComplex { -// unsigned int assId; -// std::vector qAlnChains; -// std::vector tAlnChains; -// std::vector qAlnCoords; -// std::vector tAlnCoords; -// }; - +struct AlignedChain { + std::vector x; + std::vector y; + std::vector z; +}; unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { switch (covMode) { @@ -71,8 +68,9 @@ class ComplexFilterCriteria { // per chain : criteria for chainTmThr & lddtThr std::vector qAlnChainKeys; std::vector tAlnChainKeys; - std::vector qAlnCoords; - std::vector tAlnCoords; + std::vector alnChainLens; + std::vector qAlnChains; + std::vector tAlnChains; std::vector qAlnChainTMs; std::vector tAlnChainTMs; @@ -90,8 +88,9 @@ class ComplexFilterCriteria { tAlnChainTMs.clear(); qAlnChainKeys.clear(); tAlnChainKeys.clear(); - qAlnCoords.clear(); - tAlnCoords.clear(); + alnChainLens.clear(); + qAlnChains.clear(); + tAlnChains.clear(); } bool hasTM(float TMThr, int covMode, int filterMode){ @@ -189,13 +188,41 @@ class ComplexFilterCriteria { tAlnChainTMs.push_back(tChainTm); } - void saveAlnCoords(unsigned int qChainKey, unsigned int tChainKey, Coordinates &qAlnCoords, Coordinates &tAlnCoords) { - this->qAlnChainKeys.push_back(qChainKey); - this->tAlnChainKeys.push_back(tChainKey); - // this->qAlnCoords.push_back(qAlnCoords); - // this->tAlnCoords.push_back(tAlnCoords); + void fillChainAlignment(unsigned int qChainKey, unsigned int tChainKey, + float *qdata, float *tdata, const std::string &cigar, int qStartPos, int tStartPos, int qLen, int tLen) { + AlignedChain qChain; + AlignedChain tChain; + + int qi = qStartPos; + int ti = tStartPos; + int mi = 0; + + std::string backtrace = Matcher::uncompressAlignment(cigar); + for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { + if (backtrace[btPos] == 'M') { + qChain.x.push_back(qdata[qi]); + qChain.y.push_back(qdata[qLen + qi]); + qChain.z.push_back(qdata[2*qLen + qi]); + tChain.x.push_back(tdata[ti]); + tChain.y.push_back(tdata[tLen + ti]); + tChain.z.push_back(tdata[2*tLen + ti]); + qi++; + ti++; + mi++; + } + else if (backtrace[btPos] == 'I') { + qi++; + } + else { + ti++; + } + } + qAlnChainKeys.push_back(qChainKey); + tAlnChainKeys.push_back(tChainKey); + qAlnChains.push_back(qChain); + tAlnChains.push_back(tChain); + alnChainLens.push_back(mi); } - // void update(unsigned int qChainKey, unsigned int tChainKey, double qChainTm, double tChainTm) { // this->qAlnChainTMs.push_back(qChainTm); // this->tAlnChainTMs.push_back(tChainTm); @@ -209,6 +236,7 @@ class ComplexFilterCriteria { qCov = static_cast(qTotalAlnLen) / static_cast(qLen); tCov = static_cast(tTotalAlnLen) / static_cast(tLen); } + }; void fillUArr(const std::string &uString, float (&u)[3][3]) { @@ -261,43 +289,16 @@ unsigned int cigarToAlignedLength(const std::string &cigar){ return alni; } -void fillMatchedCoord(float * qdata, float * tdata, - Coordinates &qm, Coordinates &tm, - const std::string &cigar, int qStartPos, int tStartPos, int qLen, int tLen) { - int qi = qStartPos; - int ti = tStartPos; - int mi = 0; - - std::string backtrace = Matcher::uncompressAlignment(cigar); - for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { - if (backtrace[btPos] == 'M') { - qm.x[mi] = qdata[qi]; - qm.y[mi] = qdata[qLen + qi]; - qm.z[mi] = qdata[2*qLen + qi]; - tm.x[mi] = tdata[ti]; - tm.y[mi] = tdata[tLen + ti]; - tm.z[mi] = tdata[2*tLen + ti]; - qi++; - ti++; - mi++; - } - else if (backtrace[btPos] == 'I') { - qi++; - } - else { - ti++; - } - } -} - -double computeChainTmScore(Coordinates &qm, Coordinates &tm, float t[3], float u[3][3], unsigned int alnLen, int tLen) { +double computeChainTmScore(AlignedChain &qchain, AlignedChain &tchain, float t[3], float u[3][3], int tLen, unsigned int alnLen) { double tmscore = 0; float d0 = 1.24*(cbrt(tLen-15)) -1.8; float d02 = d0*d0; - Coordinates tmt(alnLen); - BasicFunction::do_rotation(tm, tmt, alnLen, t, u); + for (unsigned int k=0; k 0.0f || par.filtInterfaceLddtThr > 0.0f) { char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); - unsigned int alnLen = cigarToAlignedLength(res.backtrace); - Coordinates qm(alnLen), tm(alnLen); - fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); + + cmplfiltcrit.fillChainAlignment(qChainKey, tChainKey, qdata, tdata, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); if (par.filtChainTmThr > 0.0f) { - double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); - localComplexMap.at(assId).updateChainTm(chainTm / res.qLen, chainTm / res.dbLen); + double chainTm = computeChainTmScore(cmplfiltcrit.qAlnChains.back(), cmplfiltcrit.tAlnChains.back(), t, u, res.dbLen, cmplfiltcrit.alnChainLens.back()); + cmplfiltcrit.updateChainTm(chainTm / res.qLen, chainTm / res.dbLen); } - // DOING - if (par.filtInterfaceLddtThr > 0.0f) { - localComplexMap.at(assId).saveAlnCoords(qChainKey, tChainKey, qm, tm); - } + // // DOING + // if (par.filtInterfaceLddtThr > 0.0f) { + // } } } // while end } @@ -538,14 +538,19 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdx]; - assId_res.second.calcCov(qComplex.complexLength, tComplex.complexLength); + ComplexFilterCriteria &cmplfiltcrit = assId_res.second; + cmplfiltcrit.calcCov(qComplex.complexLength, tComplex.complexLength); + if (par.filtChainTmThr > 0.0) { + // TODO + } // TODO: Do something for interface LDDT + // Check if the criteria are met - if (!(assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain))){ + if (!(cmplfiltcrit.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain))){ continue; } - unsigned int alnlen = adjustAlnLen(assId_res.second.qTotalAlnLen, assId_res.second.tTotalAlnLen, par.covMode); + unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); // Get the best alignement per each target complex if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; @@ -563,14 +568,15 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ unsigned int assId = selectedAssIDs[assIdidx]; - unsigned int tComplexId = localComplexMap.at(assId).targetComplexId; + ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + unsigned int tComplexId = cmplfiltcrit.targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdx]; char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); + result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCov) + "\t"+ std::to_string(cmplfiltcrit.qTM)+"\t"+ std::to_string(cmplfiltcrit.tTM)+ "\n"); } #pragma omp critical diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp index c73c4f71..22d8582d 100644 --- a/src/workflow/MultimerCluster.cpp +++ b/src/workflow/MultimerCluster.cpp @@ -10,12 +10,8 @@ void setMultimerClusterDefaults(LocalParameters *p) { p->filtMultimerTmThr = 0.5; // FIX -<<<<<<< HEAD // p->filtChainTmThr=0.0; // FIX // p->filtInterfaceLddtThr = 0.0; // FIX -======= - p->filtChainTmThr=0.0; // FIX ->>>>>>> master p->filterMode=0; } From 7b5e7287a27de01bdc3b88a79c42db68894bb576 Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 30 Jul 2024 17:20:52 +0900 Subject: [PATCH 122/160] Implemented interfaceLDDT but naive --- src/strucclustutils/filtermultimer.cpp | 201 ++++++++++++++++--------- 1 file changed, 130 insertions(+), 71 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 63655250..35cdee2d 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -10,8 +10,9 @@ #include "Coordinate16.h" #include "tmalign/basic_fun.h" #include "MultimerUtil.h" -// #include "LDDT.h" +#include "LDDT.h" #include +#include #ifdef OPENMP #include @@ -22,6 +23,7 @@ struct Complex { unsigned int nChain; unsigned int complexLength; std::string complexName; + std::vector chainLengths; std::vector chainKeys; // Coordinate16 Coords; @@ -32,7 +34,7 @@ struct Complex { } }; -struct AlignedChain { +struct AlignedCoordinate { std::vector x; std::vector y; std::vector z; @@ -60,47 +62,46 @@ class ComplexFilterCriteria { unsigned int tTotalAlnLen; float qCov; float tCov; - double qTM; - double tTM; + float interfaceLddt; + double qTm; + double tTm; float t[3]; float u[3][3]; // per chain : criteria for chainTmThr & lddtThr std::vector qAlnChainKeys; std::vector tAlnChainKeys; - std::vector alnChainLens; - std::vector qAlnChains; - std::vector tAlnChains; + std::vector qAlnChains; + std::vector tAlnChains; - std::vector qAlnChainTMs; - std::vector tAlnChainTMs; + std::vector qAlnChainTms; + std::vector tAlnChainTms; ComplexFilterCriteria() {} - ComplexFilterCriteria(unsigned int targetComplexId, double qTM, double tTM, float tstring[3], float ustring[3][3]) : - targetComplexId(targetComplexId), qTotalAlnLen(0), tTotalAlnLen(0), qTM(qTM), tTM(tTM) { + ComplexFilterCriteria(unsigned int targetComplexId, double qTm, double tTm, float tstring[3], float ustring[3][3]) : + targetComplexId(targetComplexId), qTotalAlnLen(0), tTotalAlnLen(0), interfaceLddt(0), qTm(qTm), tTm(tTm) { std::copy(tstring, tstring + 3, t); for (int i = 0; i < 3; i++) { std::copy(ustring[i], ustring[i] + 3, u[i]); } } ~ComplexFilterCriteria() { - qAlnChainTMs.clear(); - tAlnChainTMs.clear(); + qAlnChainTms.clear(); + tAlnChainTms.clear(); qAlnChainKeys.clear(); tAlnChainKeys.clear(); - alnChainLens.clear(); qAlnChains.clear(); tAlnChains.clear(); } - bool hasTM(float TMThr, int covMode, int filterMode){ + bool hasTm(float TmThr, int covMode, int filterMode){ switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: - return ((qTM>= TMThr) && (tTM >= TMThr)); + return ((qTm>= TmThr) && (tTm >= TmThr)); case Parameters::COV_MODE_TARGET: - return (tTM >= TMThr); + return (tTm >= TmThr); case Parameters::COV_MODE_QUERY: - return (qTM >= TMThr); + return (qTm >= TmThr); default: return true; } @@ -111,11 +112,11 @@ class ComplexFilterCriteria { // case LocalParameters::FILTER_MODE_INTERFACE: // switch (covMode) { // case Parameters::COV_MODE_BIDIRECTIONAL: - // return (qAlnChainTMs.size()==qChainNum && qChainNum==tChainNum); + // return (qAlnChainTms.size()==qChainNum && qChainNum==tChainNum); // case Parameters::COV_MODE_TARGET: - // return (tAlnChainTMs.size()==tChainNum); + // return (tAlnChainTms.size()==tChainNum); // case Parameters::COV_MODE_QUERY: - // return (qAlnChainTMs.size()==qChainNum); + // return (qAlnChainTms.size()==qChainNum); // default: // return true; // } @@ -135,28 +136,28 @@ class ComplexFilterCriteria { // } // } - bool hasChainTm(float chainTMThr, int covMode, unsigned int qChainNum, unsigned int tChainNum) { - if (qAlnChainTMs.size()= iLddtThr) : true; - // const bool conformationOK = isConformation(filterMode, chainTMThr); - // return (covOK && TMOK && chainNumOK && chainTMOK); - return (covOK && TMOK && chainTMOK); + // const bool conformationOK = isConformation(filterMode, chainTmThr); + // return (covOK && TmOK && chainNumOK && chainTmOK); + return (covOK && TmOK && chainTmOK && lddtOK); } void updateAln(unsigned int qAlnLen, unsigned int tAlnLen) { @@ -183,29 +185,35 @@ class ComplexFilterCriteria { tTotalAlnLen += tAlnLen; } - void updateChainTm(double qChainTm, double tChainTm) { - qAlnChainTMs.push_back(qChainTm); - tAlnChainTMs.push_back(tChainTm); + void updateChainTmScore(double qChainTm, double tChainTm) { + qAlnChainTms.push_back(qChainTm); + tAlnChainTms.push_back(tChainTm); } - void fillChainAlignment(unsigned int qChainKey, unsigned int tChainKey, + void fillChainAlignment(unsigned int qChainKey, unsigned int tChainKey, unsigned int alnLen, float *qdata, float *tdata, const std::string &cigar, int qStartPos, int tStartPos, int qLen, int tLen) { - AlignedChain qChain; - AlignedChain tChain; - + AlignedCoordinate qChain; + AlignedCoordinate tChain; int qi = qStartPos; int ti = tStartPos; int mi = 0; - std::string backtrace = Matcher::uncompressAlignment(cigar); + + qChain.x.resize(alnLen); + qChain.y.resize(alnLen); + qChain.z.resize(alnLen); + tChain.x.resize(alnLen); + tChain.y.resize(alnLen); + tChain.z.resize(alnLen); + for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { if (backtrace[btPos] == 'M') { - qChain.x.push_back(qdata[qi]); - qChain.y.push_back(qdata[qLen + qi]); - qChain.z.push_back(qdata[2*qLen + qi]); - tChain.x.push_back(tdata[ti]); - tChain.y.push_back(tdata[tLen + ti]); - tChain.z.push_back(tdata[2*tLen + ti]); + qChain.x[mi] = qdata[qi]; + qChain.y[mi] = qdata[qLen + qi]; + qChain.z[mi] = qdata[2*qLen + qi]; + tChain.x[mi] = tdata[ti]; + tChain.y[mi] = tdata[tLen + ti]; + tChain.z[mi] = tdata[2*tLen + ti]; qi++; ti++; mi++; @@ -221,15 +229,13 @@ class ComplexFilterCriteria { tAlnChainKeys.push_back(tChainKey); qAlnChains.push_back(qChain); tAlnChains.push_back(tChain); - alnChainLens.push_back(mi); } // void update(unsigned int qChainKey, unsigned int tChainKey, double qChainTm, double tChainTm) { - // this->qAlnChainTMs.push_back(qChainTm); - // this->tAlnChainTMs.push_back(tChainTm); + // this->qAlnChainTms.push_back(qChainTm); + // this->tAlnChainTms.push_back(tChainTm); // this->qAlnChainKeys.push_back(qChainKey); // this->tAlnChainKeys.push_back(tChainKey); - // // FIXME : What is this? // } void calcCov(unsigned int qLen, unsigned int tLen) { @@ -237,6 +243,59 @@ class ComplexFilterCriteria { tCov = static_cast(tTotalAlnLen) / static_cast(tLen); } + void computeInterfaceLddt(float threshold = 8) { + float t2 = threshold * threshold; + AlignedCoordinate qInterface; + AlignedCoordinate tInterface; + std::vector> qInterfaceLookup(qAlnChains.size()); // chainIdx, resIdx + + // Find and save interface Coordinates + for (size_t chainIdx1 = 0; chainIdx1 < qAlnChains.size(); chainIdx1++) { + for (size_t chainIdx2 = chainIdx1+1; chainIdx2 < qAlnChains.size(); chainIdx2++) { + AlignedCoordinate qChain1 = qAlnChains[chainIdx1]; + AlignedCoordinate qChain2 = qAlnChains[chainIdx2]; + AlignedCoordinate tChain1 = tAlnChains[chainIdx1]; + AlignedCoordinate tChain2 = tAlnChains[chainIdx2]; + for (size_t resIdx1 = 0; resIdx1 < qChain1.x.size(); resIdx1++) { + for (size_t resIdx2 = 0; resIdx2 < qChain2.x.size(); resIdx2++) { + float dist = BasicFunction::dist(qChain1.x[resIdx1], qChain1.y[resIdx1], qChain1.z[resIdx1], + qChain2.x[resIdx2], qChain2.y[resIdx2], qChain2.z[resIdx2]); + if (dist < t2) { + if (qInterfaceLookup[chainIdx1].find(resIdx1) == qInterfaceLookup[chainIdx1].end()) { + qInterface.x.push_back(qChain1.x[resIdx1]); + qInterface.y.push_back(qChain1.y[resIdx1]); + qInterface.z.push_back(qChain1.z[resIdx1]); + tInterface.x.push_back(tChain1.x[resIdx1]); + tInterface.y.push_back(tChain1.y[resIdx1]); + tInterface.z.push_back(tChain1.z[resIdx1]); + qInterfaceLookup[chainIdx1].insert(resIdx1); + } + if (qInterfaceLookup[chainIdx2].find(resIdx2) == qInterfaceLookup[chainIdx2].end()) { + qInterface.x.push_back(qChain2.x[resIdx2]); + qInterface.y.push_back(qChain2.y[resIdx2]); + qInterface.z.push_back(qChain2.z[resIdx2]); + tInterface.x.push_back(tChain2.x[resIdx2]); + tInterface.y.push_back(tChain2.y[resIdx2]); + tInterface.z.push_back(tChain2.z[resIdx2]); + qInterfaceLookup[chainIdx2].insert(resIdx2); + } + } + } + } + } + } + size_t intLen = qInterface.x.size(); + if (intLen == 0) { + return; + } + std::string bt(intLen, 'M'); + LDDTCalculator *lddtcalculator = NULL; + lddtcalculator = new LDDTCalculator(intLen+1, intLen+1); + lddtcalculator->initQuery(qInterface.x.size(), &qInterface.x[0], &qInterface.y[0], &qInterface.z[0]); + LDDTCalculator::LDDTScoreResult lddtres = lddtcalculator->computeLDDTScore(intLen, 0, 0, bt, &tInterface.x[0], &tInterface.y[0], &tInterface.z[0]); + interfaceLddt = lddtres.avgLddtScore; + delete lddtcalculator; + } }; void fillUArr(const std::string &uString, float (&u)[3][3]) { @@ -289,7 +348,8 @@ unsigned int cigarToAlignedLength(const std::string &cigar){ return alni; } -double computeChainTmScore(AlignedChain &qchain, AlignedChain &tchain, float t[3], float u[3][3], int tLen, unsigned int alnLen) { +double computeChainTmScore(AlignedCoordinate &qchain, AlignedCoordinate &tchain, float t[3], float u[3][3], int tLen) { + unsigned int alnLen = qchain.x.size(); double tmscore = 0; float d0 = 1.24*(cbrt(tLen-15)) -1.8; float d02 = d0*d0; @@ -311,15 +371,17 @@ void getComplexResidueLength( IndexReader *Dbr, std::vector &complexes) if (chainKeys.empty()) { continue; } - unsigned int reslen = 0; + unsigned int cmpllen = 0; for (auto chainKey: chainKeys) { size_t id = Dbr->sequenceReader->getId(chainKey); // Not accessible if (id == NOT_AVAILABLE_CHAIN_KEY) continue; - reslen += Dbr->sequenceReader->getSeqLen(id); + unsigned int reslen = Dbr->sequenceReader->getSeqLen(id); + complex->chainLengths.push_back(reslen); + cmpllen += Dbr->sequenceReader->getSeqLen(id); } - complex->complexLength = reslen; + complex->complexLength = cmpllen; } } @@ -422,7 +484,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; getlookupInfo(qLookupFile, qChainKeyToComplexIdMap, qComplexes, qComplexIdToIdx); - getComplexResidueLength(qDbr, qComplexes); + getComplexResidueLength(qDbr, qComplexes); Debug::Progress progress(qComplexes.size()); std::map qComplexIdResult; @@ -518,16 +580,12 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); - cmplfiltcrit.fillChainAlignment(qChainKey, tChainKey, qdata, tdata, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - + unsigned int alnLen = cigarToAlignedLength(res.backtrace); + cmplfiltcrit.fillChainAlignment(qChainKey, tChainKey, alnLen, qdata, tdata, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); if (par.filtChainTmThr > 0.0f) { - double chainTm = computeChainTmScore(cmplfiltcrit.qAlnChains.back(), cmplfiltcrit.tAlnChains.back(), t, u, res.dbLen, cmplfiltcrit.alnChainLens.back()); - cmplfiltcrit.updateChainTm(chainTm / res.qLen, chainTm / res.dbLen); + double chainTm = computeChainTmScore(cmplfiltcrit.qAlnChains.back(), cmplfiltcrit.tAlnChains.back(), t, u, res.dbLen); + cmplfiltcrit.updateChainTmScore(chainTm / res.qLen, chainTm / res.dbLen); } - - // // DOING - // if (par.filtInterfaceLddtThr > 0.0f) { - // } } } // while end } @@ -540,13 +598,13 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t ComplexFilterCriteria &cmplfiltcrit = assId_res.second; cmplfiltcrit.calcCov(qComplex.complexLength, tComplex.complexLength); - if (par.filtChainTmThr > 0.0) { - // TODO + + if (par.filtInterfaceLddtThr > 0.0) { + cmplfiltcrit.computeInterfaceLddt(); } - // TODO: Do something for interface LDDT // Check if the criteria are met - if (!(cmplfiltcrit.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, qComplex.nChain, tComplex.nChain))){ + if (!(cmplfiltcrit.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))){ continue; } @@ -576,7 +634,8 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCov) + "\t"+ std::to_string(cmplfiltcrit.qTM)+"\t"+ std::to_string(cmplfiltcrit.tTM)+ "\n"); + // result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCov) + "\t"+ std::to_string(cmplfiltcrit.qTm)+"\t"+ std::to_string(cmplfiltcrit.tTm)+ "\n"); + result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCjjov) + "\t"+ std::to_string(cmplfiltcrit.qTm)+"\t"+ std::to_string(cmplfiltcrit.tTm)+"\t"+ std::to_string(cmplfiltcrit.interfaceLddt)+"\n"); } #pragma omp critical From 22d24ffc231fe4c0d30de125332e1f6cb3dcdfef Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 30 Jul 2024 17:53:34 +0900 Subject: [PATCH 123/160] Separated interface retrieving and saving --- src/strucclustutils/filtermultimer.cpp | 60 ++++++++++++++++---------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 35cdee2d..5411f2e6 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -38,6 +38,17 @@ struct AlignedCoordinate { std::vector x; std::vector y; std::vector z; + AlignedCoordinate() {} + AlignedCoordinate(size_t size) { + x.resize(size); + y.resize(size); + z.resize(size); + } + ~AlignedCoordinate() { + x.clear(); + y.clear(); + z.clear(); + } }; unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { @@ -245,10 +256,8 @@ class ComplexFilterCriteria { void computeInterfaceLddt(float threshold = 8) { float t2 = threshold * threshold; - AlignedCoordinate qInterface; - AlignedCoordinate tInterface; - std::vector> qInterfaceLookup(qAlnChains.size()); // chainIdx, resIdx - + std::vector> qInterfacePos(qAlnChains.size()); // chainIdx, resIdx + unsigned int intLen = 0; // Find and save interface Coordinates for (size_t chainIdx1 = 0; chainIdx1 < qAlnChains.size(); chainIdx1++) { for (size_t chainIdx2 = chainIdx1+1; chainIdx2 < qAlnChains.size(); chainIdx2++) { @@ -261,37 +270,42 @@ class ComplexFilterCriteria { float dist = BasicFunction::dist(qChain1.x[resIdx1], qChain1.y[resIdx1], qChain1.z[resIdx1], qChain2.x[resIdx2], qChain2.y[resIdx2], qChain2.z[resIdx2]); if (dist < t2) { - if (qInterfaceLookup[chainIdx1].find(resIdx1) == qInterfaceLookup[chainIdx1].end()) { - qInterface.x.push_back(qChain1.x[resIdx1]); - qInterface.y.push_back(qChain1.y[resIdx1]); - qInterface.z.push_back(qChain1.z[resIdx1]); - tInterface.x.push_back(tChain1.x[resIdx1]); - tInterface.y.push_back(tChain1.y[resIdx1]); - tInterface.z.push_back(tChain1.z[resIdx1]); - qInterfaceLookup[chainIdx1].insert(resIdx1); + if (qInterfacePos[chainIdx1].find(resIdx1) == qInterfacePos[chainIdx1].end()) { + qInterfacePos[chainIdx1].insert(resIdx1); + intLen++; } - if (qInterfaceLookup[chainIdx2].find(resIdx2) == qInterfaceLookup[chainIdx2].end()) { - qInterface.x.push_back(qChain2.x[resIdx2]); - qInterface.y.push_back(qChain2.y[resIdx2]); - qInterface.z.push_back(qChain2.z[resIdx2]); - tInterface.x.push_back(tChain2.x[resIdx2]); - tInterface.y.push_back(tChain2.y[resIdx2]); - tInterface.z.push_back(tChain2.z[resIdx2]); - qInterfaceLookup[chainIdx2].insert(resIdx2); + if (qInterfacePos[chainIdx2].find(resIdx2) == qInterfacePos[chainIdx2].end()) { + qInterfacePos[chainIdx2].insert(resIdx2); + intLen++; } } } } } } - size_t intLen = qInterface.x.size(); + if (intLen == 0) { return; } + AlignedCoordinate qInterface(intLen); + AlignedCoordinate tInterface(intLen); + + size_t idx = 0; + for (size_t chainIdx = 0; chainIdx < qInterfacePos.size(); chainIdx++) { + for (size_t resIdx: qInterfacePos[chainIdx]) { + qInterface.x[idx] = qAlnChains[chainIdx].x[resIdx]; + qInterface.y[idx] = qAlnChains[chainIdx].y[resIdx]; + qInterface.z[idx] = qAlnChains[chainIdx].z[resIdx]; + tInterface.x[idx] = tAlnChains[chainIdx].x[resIdx]; + tInterface.y[idx] = tAlnChains[chainIdx].y[resIdx]; + tInterface.z[idx] = tAlnChains[chainIdx].z[resIdx]; + idx++; + } + } std::string bt(intLen, 'M'); LDDTCalculator *lddtcalculator = NULL; lddtcalculator = new LDDTCalculator(intLen+1, intLen+1); - lddtcalculator->initQuery(qInterface.x.size(), &qInterface.x[0], &qInterface.y[0], &qInterface.z[0]); + lddtcalculator->initQuery(intLen, &qInterface.x[0], &qInterface.y[0], &qInterface.z[0]); LDDTCalculator::LDDTScoreResult lddtres = lddtcalculator->computeLDDTScore(intLen, 0, 0, bt, &tInterface.x[0], &tInterface.y[0], &tInterface.z[0]); interfaceLddt = lddtres.avgLddtScore; delete lddtcalculator; @@ -635,7 +649,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); // result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCov) + "\t"+ std::to_string(cmplfiltcrit.qTm)+"\t"+ std::to_string(cmplfiltcrit.tTm)+ "\n"); - result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCjjov) + "\t"+ std::to_string(cmplfiltcrit.qTm)+"\t"+ std::to_string(cmplfiltcrit.tTm)+"\t"+ std::to_string(cmplfiltcrit.interfaceLddt)+"\n"); + result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCov) + "\t"+ std::to_string(cmplfiltcrit.qTm)+"\t"+ std::to_string(cmplfiltcrit.tTm)+"\t"+ std::to_string(cmplfiltcrit.interfaceLddt)+"\n"); } #pragma omp critical From 0f6bb3cc1bf70e50ff7c339822b15b3811438451 Mon Sep 17 00:00:00 2001 From: rachelse Date: Tue, 30 Jul 2024 17:54:28 +0900 Subject: [PATCH 124/160] Deleted original interfaceLDDT code file --- src/strucclustutils/filtercomplex_origin.cpp | 921 ------------------- 1 file changed, 921 deletions(-) delete mode 100644 src/strucclustutils/filtercomplex_origin.cpp diff --git a/src/strucclustutils/filtercomplex_origin.cpp b/src/strucclustutils/filtercomplex_origin.cpp deleted file mode 100644 index 5d3664f0..00000000 --- a/src/strucclustutils/filtercomplex_origin.cpp +++ /dev/null @@ -1,921 +0,0 @@ -#include "DBWriter.h" -#include "Util.h" -#include "LocalParameters.h" -#include "Matcher.h" -#include "Debug.h" -#include "DBReader.h" -#include "IndexReader.h" -#include "FileUtil.h" -#include "MemoryMapped.h" -#include "Coordinate16.h" -#include "tmalign/basic_fun.h" -#include "createcomplexreport.h" -#include "LDDT.h" -#include "CalcProbTP.h" -#include - - -#ifdef OPENMP -#include -#endif - - -unsigned int adjustAlnLen(unsigned int qcov, unsigned int tcov, int covMode) { - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return (qcov+tcov)/2; - case Parameters::COV_MODE_TARGET: - return qcov; - case Parameters::COV_MODE_QUERY: - return tcov; - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : - return 0; - } -} - -// bool hasChainnum(bool sameChainNum, int qChainNum, int tChainNum){ -// switch (sameChainNum){ -// case 1: -// if (qChainNum != tChainNum){ -// return false; -// }else{return true;} -// case 0: -// return true; -// } -// } - -bool hasChainTm(float chainTMThr, int covMode, std::vector &qChainTmScores, std::vector &tChainTmScores, unsigned int qChainNum, unsigned int tChainNum) { - if (chainTMThr > 0 ){ - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - if (qChainTmScores.size() coords; - unsigned int coordNum; -}; - -struct ChainForInterface { - std::vector indexVec; - unsigned int chainid; - - ChainForInterface(int idx) - : chainid(idx) {} -}; - -struct Interface{ - std::vector chainsinInterface; -}; - -struct InterfaceForLDDT{ - std::vector chainsinInterface; -}; - -struct Complex { - std::vector InterfaceVec; -}; - -struct ComplexForLDDT { - std::vector InterfaceVec; -}; - -void findInterface(float* qdata, int qLen, unsigned int qChainDbId, float* qdata2, int qLen2, unsigned int qChainDbId2, float distanceThreshold, Interface &interface) { - ChainForInterface qtmpChain(qChainDbId), qtmpChain2(qChainDbId2); - for (int qpos = 0; qpos tposIndex, qposIndex, qIndextIndex; - for (size_t i = 0; i< tChain.indexVec.size(); i++){ - tposIndex[tChain.indexVec[i]] = i; - } - for (size_t i = 0; i< qChain.indexVec.size(); i++){ - qposIndex[qChain.indexVec[i]] = i; - } - int qi = qStartPos; - int ti = tStartPos; - int mi = 0; - for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { - if (backtrace[btPos] == 'M') { - qi++; - ti++; - mi++; - } - else if (backtrace[btPos] == 'I') { - qi++; - } - else { - ti++; - } - if (qposIndex.find(qi) != qposIndex.end()){ - if(tposIndex.find(ti) != tposIndex.end()){ - qIndextIndex[qposIndex[qi]] = tposIndex[ti]; - } - } - } - qNewChain.coordNum = qIndextIndex.size(); - tNewChain.coordNum = qIndextIndex.size(); - for (auto &pair : qIndextIndex){ - qNewChain.coords.push_back(qdata[qChain.indexVec[pair.first]]); - tNewChain.coords.push_back(tdata[tChain.indexVec[pair.second]]); - } - for (auto &pair : qIndextIndex){ - qNewChain.coords.push_back(qdata[qSeqLen + qChain.indexVec[pair.first]]); - tNewChain.coords.push_back(tdata[tSeqLen + tChain.indexVec[pair.second]]); - } - for (auto &pair : qIndextIndex){ - qNewChain.coords.push_back(qdata[2*qSeqLen + qChain.indexVec[pair.first]]); - tNewChain.coords.push_back(tdata[2*tSeqLen + tChain.indexVec[pair.second]]); - } -} - -class ComplexFilterCriteria { -public: - unsigned int dbKey; - unsigned int qTotalAlnLen; - unsigned int tTotalAlnLen; - float qCov; - float tCov; - double qTM; - double tTM; - Complex tComplex; - ComplexForLDDT qnewComplex; - ComplexForLDDT tnewComplex; - std::vector alignedQChainTmScores; - std::vector alignedTChainTmScores; - std::vector intLDDTScores; - ComplexFilterCriteria() {} - ComplexFilterCriteria(unsigned int dbKey, unsigned int qTotalAlnLen, unsigned int tTotalAlnLen, double qTM, double tTM, double qChainTm, double tChainTm, Complex &tComplex, ComplexForLDDT &qnewComplex, ComplexForLDDT &tnewComplex) : - dbKey(dbKey), qTotalAlnLen(qTotalAlnLen), tTotalAlnLen(tTotalAlnLen), qTM(qTM), tTM(tTM), tComplex(tComplex), qnewComplex(qnewComplex), tnewComplex(tnewComplex) { - alignedQChainTmScores.push_back(qChainTm); - alignedTChainTmScores.push_back(tChainTm); - } - ~ComplexFilterCriteria() { - alignedQChainTmScores.clear(); - alignedTChainTmScores.clear(); - } - - bool hasTM(float TMThr, int covMode, int filterMode){ - switch (filterMode){ - case LocalParameters::FILTER_MODE_INTERFACE: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return ((qTM>= TMThr) && (tTM >= TMThr)); - case Parameters::COV_MODE_TARGET: - return (tTM >= TMThr); - case Parameters::COV_MODE_QUERY: - return (qTM >= TMThr); - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : - return true; - } - } - } - - bool hasChainNum(int covMode, int filterMode, int qChainNum, int tChainNum ){ - switch (filterMode){ - case LocalParameters::FILTER_MODE_INTERFACE: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return (alignedQChainTmScores.size()==qChainNum && qChainNum==tChainNum); - case Parameters::COV_MODE_TARGET: - return (alignedTChainTmScores.size()==tChainNum); - case Parameters::COV_MODE_QUERY: - return (alignedQChainTmScores.size()==qChainNum); - case Parameters::COV_MODE_LENGTH_QUERY : - case Parameters::COV_MODE_LENGTH_TARGET : - case Parameters::COV_MODE_LENGTH_SHORTER : - return true; - } - case LocalParameters::FILTER_MODE_CONFORMATION: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - return (qChainNum==tChainNum); - default: - return true; - } - case LocalParameters::FILTER_MODE_LOOSE: - return true; - - } - } - - // bool hasMatchedCoord(int filterMode){ - // switch (filterMode) { - // case LocalParameters::FILTER_MODE_INTERFACE: - // return (sameCoord); - // case LocalParameters::FILTER_MODE_CONFORMATION: - // case LocalParameters::FILTER_MODE_LOOSE: - // return true; - // } - // } - - void calLDDT( std::map &qIntpostotIntpos ){ - float* qdatatmp1; - float* qdatatmp2; - float* tdatatmp1; - float* tdatatmp2; - - for (auto pair : qIntpostotIntpos){ - size_t seqlength1 = qnewComplex.InterfaceVec[pair.first].chainsinInterface[0].coords.size()/3; - size_t seqlength2 = qnewComplex.InterfaceVec[pair.first].chainsinInterface[1].coords.size()/3; - if (seqlength1 == 0 || seqlength2 == 0){ - continue; - } - qdatatmp1 = qnewComplex.InterfaceVec[pair.first].chainsinInterface[0].coords.data(); - qdatatmp2 = qnewComplex.InterfaceVec[pair.first].chainsinInterface[1].coords.data(); - tdatatmp1 = tnewComplex.InterfaceVec[pair.second].chainsinInterface[0].coords.data(); - tdatatmp2 = tnewComplex.InterfaceVec[pair.second].chainsinInterface[1].coords.data(); - - LDDTCalculator *lddtcalculator = NULL; - lddtcalculator = new LDDTCalculator(seqlength1 + 1, seqlength1 + 1); - lddtcalculator->initQuery(seqlength1, qdatatmp1, &qdatatmp1[seqlength1], &qdatatmp1[2*seqlength1]); - LDDTCalculator::LDDTScoreResult lddtres; - std::string backtrace(seqlength1, 'M'); - lddtres = lddtcalculator->computeLDDTScore(seqlength1, 0, 0, backtrace, tdatatmp1, &tdatatmp1[seqlength1], &tdatatmp1[2*seqlength1]); - double lddtresScore = lddtres.avgLddtScore; - - delete lddtcalculator; - lddtcalculator = new LDDTCalculator(seqlength2 + 1, seqlength2 + 1); - lddtcalculator->initQuery(seqlength2, qdatatmp2, &qdatatmp2[seqlength2], &qdatatmp2[2*seqlength2]); - LDDTCalculator::LDDTScoreResult lddtres2; - std::string backtrace2(seqlength2, 'M'); - lddtres2 = lddtcalculator->computeLDDTScore(seqlength2, 0, 0, backtrace2, tdatatmp2, &tdatatmp2[seqlength2], &tdatatmp2[2*seqlength2]); - lddtresScore += lddtres2.avgLddtScore; - intLDDTScores.push_back(lddtresScore/2); - delete lddtcalculator; - } - } - - bool hasintLDDT (int covMode, int filterMode, float intLDDTThr){ - switch (filterMode){ - case LocalParameters::FILTER_MODE_INTERFACE: - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - // if (intLDDTScores.size()qTotalAlnLen += qTotalAlnLen; - this->tTotalAlnLen += tTotalAlnLen; - this->alignedQChainTmScores.push_back(qChainTm); - this->alignedTChainTmScores.push_back(tChainTm); - } - - void calcCov(unsigned int qLen, unsigned int tLen) { - qCov = static_cast(qTotalAlnLen) / static_cast(qLen); - tCov = static_cast(tTotalAlnLen) / static_cast(tLen); - } -}; - -void fillUArr(const std::string &uString, float (&u)[3][3]) { - std::string tmp; - int i = 0; - int j=0; - const int ulen = static_cast(uString.size()); - for (int k=0; k < ulen; k++) { - if (k==ulen-1) { - u[i][j] = std::stof(tmp); - } else if (uString[k] == ',') { - u[i][j] = std::stof(tmp); - tmp.clear(); - j++; - } else { - tmp.push_back(uString[k]); - } - if (j == 3) { - i++; - j = 0; - } - } -} - -void fillTArr(const std::string &tString, float (&t)[3]) { - std::string tmp; - int i = 0; - const int tlen = static_cast(tString.size()); - for (int k=0; k &ChainKeys) { - unsigned int ResidueLen = 0; - for (auto ChainKey: ChainKeys) { - size_t id = Dbr->sequenceReader->getId(ChainKey); - // Not accessible - if (id == NOT_AVAILABLE_CHAIN_KEY) - return 0; - ResidueLen += Dbr->sequenceReader->getSeqLen(id); - } - return ResidueLen; -} - -static void getlookupInfo( - const std::string &file, - std::map &complexIdtoName, - std::map &chainKeyToComplexIdLookup, - std::map> &complexIdToChainKeysLookup, - std::vector &complexIdVec -) { - if (file.length() == 0) { - return; - } - MemoryMapped lookupDB(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); - char *data = (char *) lookupDB.getData(); - char *end = data + lookupDB.mappedSize(); - const char *entry[255]; - int prevComplexId = -1; - while (data < end && *data != '\0') { - const size_t columns = Util::getWordsOfLine(data, entry, 255); - if (columns < 3) { - Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n"; - continue; - } - auto chainKey = Util::fast_atoi(entry[0]); - std::string chainName(entry[1], (entry[2] - entry[1]) - 1); - auto complexId = Util::fast_atoi(entry[2]); - chainKeyToComplexIdLookup.emplace(chainKey, complexId); - - size_t lastUnderscoreIndex = chainName.find_last_of('_'); - std::string complexName = chainName.substr(0, lastUnderscoreIndex); - - if (complexId != prevComplexId) { - complexIdToChainKeysLookup.emplace(complexId, std::vector()); - complexIdVec.emplace_back(complexId); - complexIdtoName.emplace(complexId, complexName); - prevComplexId = complexId; - } - complexIdToChainKeysLookup.at(complexId).emplace_back(chainKey); - data = Util::skipLine(data); - } - lookupDB.close(); -} - -// static void getInterfaceIndex(float* &data1, int size1, float* &data2, int size2, int interfaceDistThr, std::vector &x, std::vector &y, std::vector &z){ -// unsigned int interLen = 0; -// for (int i =0; i < size1; i ++){ -// for (int j =0; j < size2; j ++){ -// float atomDist = BasicFunction::dist(data1[i], data1[i+size1], data1[i+size1*2], data2[j], data2[j+size2], data2[j+size2*2]); -// if (atomDist<= interfaceDistThr){ -// x.push_back(data1[i]); -// y.push_back(data1[i+size1]); -// z.push_back(data1[i+size1*2]); -// x.push_back(data2[j]); -// y.push_back(data2[j+size2]); -// z.push_back(data2[j+size2*2]); -// interLen ++; -// } -// } -// } -// Coordinates -// } - -// static void getInterface(std::vector &ChainKeys, DBReader &StructDbr, std::vector &interfaceIndexVec, IndexReader* &DBr, unsigned int thread_idx, int interfaceDistThr){ -// for (size_t i = 0; i < ChainKeys.size(); ++i) { -// unsigned int ChainDbId1 = DBr->sequenceReader->getId(ChainKeys[i]); -// char *cadata1 = StructDbr.getData(ChainDbId1, thread_idx); -// size_t CaLength1 = StructDbr.getEntryLen(ChainDbId1); -// size_t seqLength1 = StructDbr.getSeqLen(ChainDbId1); -// Coordinate16 coords1; -// float* data1 = coords1.read(cadata1, seqLength1, CaLength1); -// for (size_t j = i + 1; j < ChainKeys.size(); ++j) { -// unsigned int ChainDbId2 = DBr->sequenceReader->getId(ChainKeys[j]); -// char *cadata2 = StructDbr.getData(ChainDbId2, thread_idx); -// size_t CaLength2 = StructDbr.getEntryLen(ChainDbId2); -// size_t seqLength2 = StructDbr.getSeqLen(ChainDbId2); -// Coordinate16 coords2; -// float* data2 = coords2.read(cadata2, seqLength2, CaLength2); -// std::vector x; -// std::vector y; -// std::vector z; -// getInterfaceIndex(data1, seqLength1, data2, seqLength2, interfaceDistThr, x, y, z); -// interfaceIndexVec.push_back(interfaceCoord); -// } -// } -// } - -int filtercomplex(int argc, const char **argv, const Command &command) { - LocalParameters &par = LocalParameters::getLocalInstance(); - par.parseParameters(argc, argv, command, true, 0, 0); - const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; - const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); - int dbaccessMode = (DBReader::USE_INDEX); - - IndexReader* qDbr = NULL; - qDbr = new IndexReader(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); - DBReader qStructDbr((par.db1 + "_ca").c_str(), (par.db1 + "_ca.index").c_str(), - par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - qStructDbr.open(DBReader::NOSORT); - - IndexReader* tDbr = NULL; - DBReader *tStructDbr = NULL; - if (sameDB) { - tDbr = qDbr; - tStructDbr = &qStructDbr; - } - else{ - tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); - tStructDbr = new DBReader((par.db2 + "_ca").c_str(), (par.db2 + "_ca.index").c_str(), - par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - tStructDbr->open(DBReader::NOSORT); - } - DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX| DBReader::USE_DATA); - alnDbr.open(DBReader::LINEAR_ACCCESS); - size_t localThreads = 1; - - // Debug(Debug::WARNING) << "Monomer will be treated as singleton\nMonomer chain key: \n"; -#ifdef OPENMP -localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); -#endif - const bool shouldCompress = (par.compressed == true); - const int db4Type = Parameters::DBTYPE_CLUSTER_RES; - - DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, db4Type); - resultWriter.open(); - - const int db5Type = Parameters::DBTYPE_GENERIC_DB; - //TODO: remove resultWrite5 when done - DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), 1, shouldCompress, db5Type); - resultWrite5.open(); - - std::string qLookupFile = par.db1 + ".lookup"; - std::string tLookupFile = par.db2 + ".lookup"; - - chainKeyToComplexId_t qChainKeyToComplexIdMap, tChainKeyToComplexIdMap; - complexIdToChainKeys_t qComplexIdToChainKeyMap, tComplexIdToChainKeyMap; - std::map qcomplexIdToName, tcomplexIdToName; - std::vector qComplexIdVec, tComplexIdVec; - getlookupInfo(qLookupFile, qcomplexIdToName,qChainKeyToComplexIdMap, qComplexIdToChainKeyMap, qComplexIdVec); - getlookupInfo(tLookupFile, tcomplexIdToName, tChainKeyToComplexIdMap, tComplexIdToChainKeyMap, tComplexIdVec); - qChainKeyToComplexIdMap.clear(); - // Debug::Progress progress(qComplexIdVec.size()); - std::map qComplexLength, tComplexLength; - std::map qComplexIdResult; - - for (size_t tComplexIdx = 0; tComplexIdx < tComplexIdVec.size(); tComplexIdx++) { - unsigned int tComplexId = tComplexIdVec[tComplexIdx]; - std::vector &tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - if (tChainKeys.empty()) { - continue; - } - unsigned int reslen = getComplexResidueLength(tDbr, tChainKeys); - tComplexLength[tComplexId] =reslen; - } - for (size_t qComplexIdx = 0; qComplexIdx < qComplexIdVec.size(); qComplexIdx++) { - unsigned int qComplexId = qComplexIdVec[qComplexIdx]; - std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); - if (qChainKeys.empty()) { - continue; - } - unsigned int reslen = getComplexResidueLength(qDbr, qChainKeys); - qComplexLength[qComplexId] = reslen; - } - - - -#pragma omp parallel num_threads(localThreads) - { - resultToWrite_t result5; - char buffer[32]; - unsigned int thread_idx = 0; - //TODO: set threshold for interface. now 15A - int interfaceDistThr =15; -#ifdef OPENMP - thread_idx = static_cast(omp_get_thread_num()); -#endif - std::string result; - std::map localComplexMap; - std::vector assIdsToDelete; - std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] - std::vector selectedAssIDs; - Matcher::result_t res; - std::map> assIDtoqChainIdtotChainId; - std::map> assIdtoqIntpostotIntpos; - -#pragma omp for schedule(dynamic, 1) - for (size_t queryComplexIdx = 0; queryComplexIdx < qComplexIdVec.size(); queryComplexIdx++) { - Coordinate16 qcoords; - Coordinate16 tcoords; - // progress.updateProgress(); - unsigned int qComplexId = qComplexIdVec[queryComplexIdx]; - std::vector &qChainKeys = qComplexIdToChainKeyMap.at(qComplexId); - - // std::vector> qInterfaceIndexVec; - // getInterface(qChainKeys, qStructDbr, qInterfaceIndexVec, qDbr, thread_idx, interfaceDistThr); - Complex qComplex; - for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ){ - unsigned int qChainKey = qChainKeys[qChainIdx]; - unsigned int qChainAlnId = alnDbr.getId(qChainKey); - //TODO: if monomer - if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ - continue; - } - unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); - char *data = alnDbr.getData(qChainAlnId, thread_idx); - char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); - size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); - size_t qSeqlen = qDbr->sequenceReader->getSeqLen(qChainDbId); - float* qdata = qcoords.read(qcadata, qSeqlen, qCaLength); - while (*data != '\0' ) { - ComplexDataHandler retComplex = parseScoreComplexResult(data, res); - data = Util::skipLine(data); - unsigned int assId = retComplex.assId; - unsigned int tChainKey= res.dbKey; - unsigned int tChainAlnId = alnDbr.getId(tChainKey); - unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); - if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ - continue; - } - assIDtoqChainIdtotChainId[assId][qChainDbId] = tChainDbId ; - } - for (size_t qChainIdx2 = qChainIdx+1; qChainIdx2 < qChainKeys.size(); qChainIdx2++ ){ - unsigned int qChainKey2 = qChainKeys[qChainIdx2]; - unsigned int qChainAlnId2 = alnDbr.getId(qChainKey2); - unsigned int qChainDbId2 = qDbr->sequenceReader->getId(qChainKey2); - char *qcadata2 = qStructDbr.getData(qChainDbId2, thread_idx); - size_t qCaLength2 = qStructDbr.getEntryLen(qChainDbId2); - size_t qSeqlen2 = qDbr->sequenceReader->getSeqLen(qChainDbId2); - float* qdata2 = qcoords.read(qcadata2, qSeqlen2, qCaLength2); - Interface interface; - findInterface(qdata, qSeqlen, qChainDbId, qdata2, qSeqlen2, qChainDbId2, interfaceDistThr, interface); - if (interface.chainsinInterface[0].indexVec.size() > 0){ - qComplex.InterfaceVec.push_back(interface); - } - } - } - for (size_t qChainIdx = 0; qChainIdx < qChainKeys.size(); qChainIdx++ ) { - unsigned int qChainKey = qChainKeys[qChainIdx]; - unsigned int qChainAlnId = alnDbr.getId(qChainKey); - unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); - //handling monomer as singleton - if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ - char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); - result.append(buffer, (outpos - buffer - 1)); - result.push_back('\n'); - result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(qComplexId) + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); - break; - } - - char *data = alnDbr.getData(qChainAlnId, thread_idx); - while (*data != '\0' ) { - ComplexDataHandler retComplex = parseScoreComplexResult(data, res); - char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); - size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); - float* qdata = qcoords.read(qcadata, res.qLen, qCaLength); - unsigned int qSeqlen = qDbr->sequenceReader->getSeqLen(qChainDbId); - if (!retComplex.isValid){ - Debug(Debug::ERROR) << "No scorecomplex result provided"; - EXIT(EXIT_FAILURE); - } - - data = Util::skipLine(data); - unsigned int assId = retComplex.assId; - unsigned int tChainKey= res.dbKey; - unsigned int tChainAlnId = alnDbr.getId(tChainKey); - unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); - //if target is monomer, break to be singleton - if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ - continue; - } - unsigned int tComplexId = tChainKeyToComplexIdMap.at(tChainKey); - std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - float u[3][3]; - float t[3]; - fillUArr(retComplex.uString, u); - fillTArr(retComplex.tString, t); - char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); - size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); - unsigned int tSeqlen = tDbr->sequenceReader->getSeqLen(tChainDbId); - float* tdata = tcoords.read(tcadata, res.dbLen, tCaLength); - unsigned int alnLen = cigarToAlignedLength(res.backtrace); - Coordinates qm(alnLen), tm(alnLen); - fillMatchedCoord(qdata, tdata, qm, tm, res.backtrace, res.qStartPos, res.dbStartPos, res.qLen, res.dbLen); - double chainTm = computeChainTmScore(qm, tm, t, u, alnLen, res.dbLen); - double qChainTm = chainTm / res.qLen; - double tChainTm = chainTm/ res.dbLen; - unsigned int qtotalaln = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); - unsigned int ttotalaln = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - if (localComplexMap.find(assId) == localComplexMap.end()) { - Complex tComplex; - for (size_t tChainIdx = 0; tChainIdx < tChainKeys.size(); tChainIdx++ ){ - unsigned int tChainKey = tChainKeys[tChainIdx]; - unsigned int tChainAlnId = alnDbr.getId(tChainKey); - unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); - //TODO: if monomer - if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ - break; - } - - char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); - size_t tCaLength = tStructDbr->getEntryLen(tChainDbId); - float* tdata = tcoords.read(tcadata, tSeqlen, tCaLength); - - for (size_t tChainIdx2 = tChainIdx+1; tChainIdx2 < tChainKeys.size(); tChainIdx2++ ){ - unsigned int tChainKey2 = tChainKeys[tChainIdx2]; - unsigned int tChainAlnId2 = alnDbr.getId(tChainKey2); - unsigned int tChainDbId2 = tDbr->sequenceReader->getId(tChainKey2); - char *tcadata2 = tStructDbr->getData(tChainDbId2, thread_idx); - size_t tCaLength2 = tStructDbr->getEntryLen(tChainDbId2); - size_t tSeqlen2 = tDbr->sequenceReader->getSeqLen(tChainDbId2); - float* tdata2 = tcoords.read(tcadata2, tSeqlen2, tCaLength2); - Interface interface; - findInterface(tdata, tSeqlen, tChainDbId, tdata2, tSeqlen2, tChainDbId2, interfaceDistThr, interface); - if (interface.chainsinInterface[0].indexVec.size() > 0){ - tComplex.InterfaceVec.push_back(interface); - } - } - } - - ComplexForLDDT qnewComplex, tnewComplex; - for(size_t intnum=0 ; intnum <= qComplex.InterfaceVec.size(); intnum ++){ - InterfaceForLDDT inter; - qnewComplex.InterfaceVec.push_back(inter); - } - for(size_t intnum=0 ; intnum <= tComplex.InterfaceVec.size(); intnum ++){ - InterfaceForLDDT inter; - tnewComplex.InterfaceVec.push_back(inter); - } - for (size_t qIntpos = 0; qIntpos < qComplex.InterfaceVec.size(); qIntpos++){ - for (size_t tIntpos = 0; tIntpos < tComplex.InterfaceVec.size(); tIntpos++){ - if (assIDtoqChainIdtotChainId[assId].find(qComplex.InterfaceVec[qIntpos].chainsinInterface[0].chainid)!= assIDtoqChainIdtotChainId[assId].end() && assIDtoqChainIdtotChainId[assId].find(qComplex.InterfaceVec[qIntpos].chainsinInterface[1].chainid)!= assIDtoqChainIdtotChainId[assId].end()){ - if (tComplex.InterfaceVec[tIntpos].chainsinInterface[0].chainid == assIDtoqChainIdtotChainId[assId][qComplex.InterfaceVec[qIntpos].chainsinInterface[0].chainid] && tComplex.InterfaceVec[tIntpos].chainsinInterface[1].chainid == assIDtoqChainIdtotChainId[assId][qComplex.InterfaceVec[qIntpos].chainsinInterface[1].chainid]){ - assIdtoqIntpostotIntpos[assId][qIntpos] = tIntpos ; - } - else if (tComplex.InterfaceVec[tIntpos].chainsinInterface[1].chainid == assIDtoqChainIdtotChainId[assId][qComplex.InterfaceVec[qIntpos].chainsinInterface[0].chainid] && tComplex.InterfaceVec[tIntpos].chainsinInterface[0].chainid == assIDtoqChainIdtotChainId[assId][qComplex.InterfaceVec[qIntpos].chainsinInterface[1].chainid]) { - assIdtoqIntpostotIntpos[assId][qIntpos] = tIntpos ; - } - } - } - } - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tChainKey, qtotalaln, ttotalaln, retComplex.qTmScore, retComplex.tTmScore, qChainTm, tChainTm, tComplex, qnewComplex, tnewComplex); - localComplexMap[assId] = cmplfiltcrit; - } else { - localComplexMap.at(assId).update(qtotalaln, ttotalaln, qChainTm, tChainTm); - } - for (auto pair : assIdtoqIntpostotIntpos[assId]){ - for (ChainForInterface qChain : qComplex.InterfaceVec[pair.first].chainsinInterface){ - for (ChainForInterface tChain : localComplexMap.at(assId).tComplex.InterfaceVec[pair.second].chainsinInterface){ - if(qChain.chainid == qChainDbId && tChain.chainid == tChainDbId){ - ChainForLDDT qChaintmp, tChaintmp; - AlignedInterface(qdata, tdata, qSeqlen, tSeqlen, qChain, tChain, res.backtrace, res.qStartPos, res.dbStartPos, qChaintmp, tChaintmp); - localComplexMap.at(assId).qnewComplex.InterfaceVec[pair.first].chainsinInterface.push_back(qChaintmp); - localComplexMap.at(assId).tnewComplex.InterfaceVec[pair.second].chainsinInterface.push_back(tChaintmp); - } - } - } - } - } // while end - } - for (auto& assId_res : localComplexMap){ - unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); - assId_res.second.calcCov(qComplexLength.at(qComplexId), tComplexLength.at(tComplexId)); - if (!assIdtoqIntpostotIntpos[assId_res.first].empty()){ - assId_res.second.calLDDT(assIdtoqIntpostotIntpos[assId_res.first]); - } - std::vector tChainKeys = tComplexIdToChainKeyMap.at(tComplexId); - if (!assId_res.second.satisfy(par.covMode, par.filterMode, par.covThr, par.filtComplexTmThr, par.filtChainTmThr, par.intLDDTThr, qChainKeys.size(), tChainKeys.size())){ - assIdsToDelete.push_back(assId_res.first); - } - } - - for (const auto& key : assIdsToDelete) { - localComplexMap.erase(key); - } - - for (const auto& assId_res : localComplexMap){ - unsigned int tComplexId = tChainKeyToComplexIdMap.at(assId_res.second.dbKey); - unsigned int alnlen = adjustAlnLen(assId_res.second.qTotalAlnLen, assId_res.second.tTotalAlnLen, par.covMode); - if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ - cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; - } - else { - if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]){ - cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; - } - } - } - - for (const auto& pair : cmplIdToBestAssId){ - selectedAssIDs.push_back(pair.second[0]); - } - - for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ - unsigned int assId = selectedAssIDs[assIdidx]; - unsigned int tComplexId = tChainKeyToComplexIdMap.at(localComplexMap.at(assId).dbKey); - char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); - result.append(buffer, (outpos - buffer - 1)); - result.push_back('\n'); - result5.append(qcomplexIdToName.at(qComplexId) + "\t" + tcomplexIdToName.at(tComplexId) + "\t" + std::to_string(localComplexMap.at(assId).qCov) + "\t" + std::to_string(localComplexMap.at(assId).tCov) + "\t"+ std::to_string(localComplexMap.at(assId).qTM)+"\t"+ std::to_string(localComplexMap.at(assId).tTM)+ "\n"); - } - #pragma omp critical - { - qComplexIdResult[qComplexId]= result; - } - result.clear(); - localComplexMap.clear(); - assIdsToDelete.clear(); - cmplIdToBestAssId.clear(); - selectedAssIDs.clear(); - assIdtoqIntpostotIntpos.clear(); - assIDtoqChainIdtotChainId.clear(); - } // for end - #pragma omp critical - { - resultWrite5.writeData(result5.c_str(), result5.length(), 0); - } - result5.clear(); - } // MP end - for (auto &pair : qComplexIdResult){ - resultWriter.writeData(pair.second.c_str(), pair.second.length(), pair.first); - } - - resultWriter.close(true); - resultWrite5.close(par.dbOut == false); - qStructDbr.close(); - alnDbr.close(); - delete qDbr; - if (sameDB == false) { - delete tDbr; - delete tStructDbr; - } - qChainKeyToComplexIdMap.clear(); - tChainKeyToComplexIdMap.clear(); - qComplexIdToChainKeyMap.clear(); - tComplexIdToChainKeyMap.clear(); - qcomplexIdToName.clear(); - tcomplexIdToName.clear(); - qComplexIdVec.clear(); - tComplexIdVec.clear(); - qComplexLength.clear(); - tComplexLength.clear(); - - return EXIT_SUCCESS; -} \ No newline at end of file From aa30bec590c807d9e33d5971e10338818a9db1db Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Fri, 16 Aug 2024 17:59:41 +0900 Subject: [PATCH 125/160] NogridInterface --- src/strucclustutils/filtermultimer.cpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 5411f2e6..a7d74b5d 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -263,8 +263,6 @@ class ComplexFilterCriteria { for (size_t chainIdx2 = chainIdx1+1; chainIdx2 < qAlnChains.size(); chainIdx2++) { AlignedCoordinate qChain1 = qAlnChains[chainIdx1]; AlignedCoordinate qChain2 = qAlnChains[chainIdx2]; - AlignedCoordinate tChain1 = tAlnChains[chainIdx1]; - AlignedCoordinate tChain2 = tAlnChains[chainIdx2]; for (size_t resIdx1 = 0; resIdx1 < qChain1.x.size(); resIdx1++) { for (size_t resIdx2 = 0; resIdx2 < qChain2.x.size(); resIdx2++) { float dist = BasicFunction::dist(qChain1.x[resIdx1], qChain1.y[resIdx1], qChain1.z[resIdx1], @@ -289,7 +287,6 @@ class ComplexFilterCriteria { } AlignedCoordinate qInterface(intLen); AlignedCoordinate tInterface(intLen); - size_t idx = 0; for (size_t chainIdx = 0; chainIdx < qInterfacePos.size(); chainIdx++) { for (size_t resIdx: qInterfacePos[chainIdx]) { @@ -388,9 +385,9 @@ void getComplexResidueLength( IndexReader *Dbr, std::vector &complexes) unsigned int cmpllen = 0; for (auto chainKey: chainKeys) { size_t id = Dbr->sequenceReader->getId(chainKey); - // Not accessible - if (id == NOT_AVAILABLE_CHAIN_KEY) - continue; + if (id == NOT_AVAILABLE_CHAIN_KEY){ + break; + } unsigned int reslen = Dbr->sequenceReader->getSeqLen(id); complex->chainLengths.push_back(reslen); cmpllen += Dbr->sequenceReader->getSeqLen(id); @@ -565,16 +562,15 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t data = Util::skipLine(data); unsigned int assId = retComplex.assId; unsigned int tChainKey = res.dbKey; - unsigned int tChainAlnId = alnDbr.getId(tChainKey); - //if target is monomer, break to be singleton - if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ - break; - } unsigned int tChainDbId = tDbr->sequenceReader->getId(tChainKey); unsigned int tComplexId = tChainKeyToComplexIdMap.at(tChainKey); unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; - + //if target is monomer, break to be singleton + unsigned int tChainAlnId = alnDbr.getId(tChainKey); + if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + break; + } float u[3][3]; float t[3]; fillUArr(retComplex.uString, u); From 56d3adbdbc6dcbbdfc62e2335fea6d0ed6484ebd Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 18 Aug 2024 00:06:30 +0900 Subject: [PATCH 126/160] monomer in scoremultimer --- src/commons/LocalParameters.cpp | 7 +-- src/commons/LocalParameters.h | 5 -- src/strucclustutils/filtermultimer.cpp | 67 +++++++------------------- src/strucclustutils/scoremultimer.cpp | 19 +++++--- src/workflow/MultimerCluster.cpp | 4 +- 5 files changed, 31 insertions(+), 71 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index 72f0ceee..8c7e2628 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -36,9 +36,8 @@ LocalParameters::LocalParameters() : PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON), PARAM_MULTIMER_TM_THRESHOLD(PARAM_MULTIMER_TM_THRESHOLD_ID,"--multimer-tm-threshold", "TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtMultimerTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "chain TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_INTERFACE_LDDT_THRESHOLD(PARAM_INTERFACE_LDDT_THRESHOLD_ID,"--interface-lddt-threshold", "Interface LDDT threshold", "accept alignments with a lddt > thr [0.0,1.0]",typeid(float), (void *) &filtInterfaceLddtThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), - PARAM_FILTER_MODE(PARAM_FILTER_MODE_ID, "--filter-mode", "Filter mode", "0: Interface\n1: Conformation\n2: loose", typeid(int), (void *) &filterMode, "[0-2]{0}$", MMseqsParameter::COMMAND_CLUST) - + PARAM_INTERFACE_LDDT_THRESHOLD(PARAM_INTERFACE_LDDT_THRESHOLD_ID,"--interface-lddt-threshold", "Interface LDDT threshold", "accept alignments with a lddt > thr [0.0,1.0]",typeid(float), (void *) &filtInterfaceLddtThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") + { PARAM_ALIGNMENT_MODE.description = "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id"; PARAM_ALIGNMENT_MODE.regex = "^[0-3]{1}$"; @@ -198,7 +197,6 @@ LocalParameters::LocalParameters() : filtermultimer.push_back(&PARAM_MULTIMER_TM_THRESHOLD); filtermultimer.push_back(&PARAM_CHAIN_TM_THRESHOLD); filtermultimer.push_back(&PARAM_INTERFACE_LDDT_THRESHOLD); - filtermultimer.push_back(&PARAM_FILTER_MODE); // createmultimerreport createmultimerreport.push_back(&PARAM_DB_OUTPUT); @@ -273,7 +271,6 @@ LocalParameters::LocalParameters() : filtMultimerTmThr = 0.0; filtChainTmThr = 0.0; filtInterfaceLddtThr = 0.0; - filterMode = 0; prostt5Model = ""; gpu = 0; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 43b37e1b..ae665375 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -139,10 +139,6 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_MULTIMER_TM_THRESHOLD) PARAMETER(PARAM_CHAIN_TM_THRESHOLD) PARAMETER(PARAM_INTERFACE_LDDT_THRESHOLD) - PARAMETER(PARAM_FILTER_MODE) - - - int prefMode; float tmScoreThr; @@ -169,7 +165,6 @@ class LocalParameters : public Parameters { float filtMultimerTmThr; float filtChainTmThr; float filtInterfaceLddtThr; - int filterMode; std::string prostt5Model; int gpu; diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index a7d74b5d..4040946c 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -2,7 +2,6 @@ #include "Util.h" #include "LocalParameters.h" #include "Matcher.h" -#include "Debug.h" #include "DBReader.h" #include "IndexReader.h" #include "FileUtil.h" @@ -105,7 +104,7 @@ class ComplexFilterCriteria { tAlnChains.clear(); } - bool hasTm(float TmThr, int covMode, int filterMode){ + bool hasTm(float TmThr, int covMode){ switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: return ((qTm>= TmThr) && (tTm >= TmThr)); @@ -118,35 +117,6 @@ class ComplexFilterCriteria { } } - // bool hasChainNum(int covMode, int filterMode, size_t qChainNum, size_t tChainNum ){ - // switch (filterMode){ - // case LocalParameters::FILTER_MODE_INTERFACE: - // switch (covMode) { - // case Parameters::COV_MODE_BIDIRECTIONAL: - // return (qAlnChainTms.size()==qChainNum && qChainNum==tChainNum); - // case Parameters::COV_MODE_TARGET: - // return (tAlnChainTms.size()==tChainNum); - // case Parameters::COV_MODE_QUERY: - // return (qAlnChainTms.size()==qChainNum); - // default: - // return true; - // } - // case LocalParameters::FILTER_MODE_CONFORMATION: - // switch (covMode) { - // case Parameters::COV_MODE_BIDIRECTIONAL: - // return (qChainNum==tChainNum); - // case Parameters::COV_MODE_TARGET: - // return (qChainNum>=tChainNum); - // case Parameters::COV_MODE_QUERY: - // return (qChainNum<=tChainNum); - // default: - // return true; - // } - // default: - // return true; - // } - // } - bool hasChainTm(float chainTmThr, int covMode, unsigned int qChainNum, unsigned int tChainNum) { if (qAlnChainTms.size()= iLddtThr) : true; - - // const bool conformationOK = isConformation(filterMode, chainTmThr); - // return (covOK && TmOK && chainNumOK && chainTmOK); return (covOK && TmOK && chainTmOK && lddtOK); } @@ -255,6 +221,9 @@ class ComplexFilterCriteria { } void computeInterfaceLddt(float threshold = 8) { + if(qAlnChains.size() == 1){ + interfaceLddt = 1; + } float t2 = threshold * threshold; std::vector> qInterfacePos(qAlnChains.size()); // chainIdx, resIdx unsigned int intLen = 0; @@ -537,13 +506,13 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int qChainAlnId = alnDbr.getId(qChainKey); unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); // Handling monomer as singleton - if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ - char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); - result.append(buffer, (outpos - buffer - 1)); - result.push_back('\n'); - result5.append(qComplex.complexName + "\t" + tComplexes[qComplexIdx].complexName + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); - break; - } + // if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + // char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); + // result.append(buffer, (outpos - buffer - 1)); + // result.push_back('\n'); + // result5.append(qComplex.complexName + "\t" + tComplexes[qComplexIdx].complexName + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); + // break; + // } char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); @@ -568,9 +537,9 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; //if target is monomer, break to be singleton unsigned int tChainAlnId = alnDbr.getId(tChainKey); - if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ - break; - } + // if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + // continue; + // } float u[3][3]; float t[3]; fillUArr(retComplex.uString, u); @@ -614,7 +583,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } // Check if the criteria are met - if (!(cmplfiltcrit.satisfy(par.covMode, par.filterMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))){ + if (!(cmplfiltcrit.satisfy(par.covMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))){ continue; } @@ -644,7 +613,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - // result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCov) + "\t"+ std::to_string(cmplfiltcrit.qTm)+"\t"+ std::to_string(cmplfiltcrit.tTm)+ "\n"); result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCov) + "\t"+ std::to_string(cmplfiltcrit.qTm)+"\t"+ std::to_string(cmplfiltcrit.tTm)+"\t"+ std::to_string(cmplfiltcrit.interfaceLddt)+"\n"); } @@ -682,6 +650,5 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t qComplexes.clear(); tComplexes.clear(); qComplexIdResult.clear(); - return EXIT_SUCCESS; } \ No newline at end of file diff --git a/src/strucclustutils/scoremultimer.cpp b/src/strucclustutils/scoremultimer.cpp index ca7c8105..75b2e197 100644 --- a/src/strucclustutils/scoremultimer.cpp +++ b/src/strucclustutils/scoremultimer.cpp @@ -1,14 +1,12 @@ #include "DBReader.h" #include "IndexReader.h" #include "DBWriter.h" -#include "Debug.h" #include "Util.h" #include "LocalParameters.h" -#include "Matcher.h" #include "StructureUtil.h" -#include "TMaligner.h" #include "Coordinate16.h" #include "MultimerUtil.h" +#include "Debug.h" #include "set" #ifdef OPENMP @@ -181,7 +179,8 @@ class DBSCANCluster { public: DBSCANCluster(SearchResult &searchResult, std::set &finalClusters, double minCov) : searchResult(searchResult), finalClusters(finalClusters) { cLabel = 0; - clusterSizeThr = std::max(MULTIPLE_CHAINED_COMPLEX, (unsigned int) ((double) searchResult.qChainKeys.size() * minCov)); + // clusterSizeThr = std::max(MULTIPLE_CHAINED_COMPLEX, (unsigned int) ((double) searchResult.qChainKeys.size() * minCov)); + clusterSizeThr = (unsigned int) ((double) searchResult.qChainKeys.size() * minCov); idealClusterSize = std::min(searchResult.qChainKeys.size(), searchResult.dbChainKeys.size()); prevMaxClusterSize = 0; maxDist = 0; @@ -508,8 +507,10 @@ class ComplexScorer { continue; } paredSearchResult.standardize(); - if (!paredSearchResult.alnVec.empty() && currDbChainKeys.size() >= MULTIPLE_CHAINED_COMPLEX) + // if (!paredSearchResult.alnVec.empty() && currDbChainKeys.size() >= MULTIPLE_CHAINED_COMPLEX) + if (!paredSearchResult.alnVec.empty()){ searchResults.emplace_back(paredSearchResult); + } paredSearchResult.alnVec.clear(); currDbComplexId = aln.dbChain.complexId; @@ -520,8 +521,10 @@ class ComplexScorer { } currAlns.clear(); paredSearchResult.standardize(); - if (!paredSearchResult.alnVec.empty() && currDbChainKeys.size() >= MULTIPLE_CHAINED_COMPLEX) + // if (!paredSearchResult.alnVec.empty() && currDbChainKeys.size() >= MULTIPLE_CHAINED_COMPLEX) + if (!paredSearchResult.alnVec.empty()){ searchResults.emplace_back(paredSearchResult); + } paredSearchResult.alnVec.clear(); } @@ -703,8 +706,8 @@ int scoremultimer(int argc, const char **argv, const Command &command) { for (size_t qCompIdx = 0; qCompIdx < qComplexIndices.size(); qCompIdx++) { unsigned int qComplexId = qComplexIndices[qCompIdx]; std::vector &qChainKeys = qComplexIdToChainKeysMap.at(qComplexId); - if (qChainKeys.size() < MULTIPLE_CHAINED_COMPLEX) - continue; + // if (qChainKeys.size() < MULTIPLE_CHAINED_COMPLEX) + // continue; complexScorer.getSearchResults(qComplexId, qChainKeys, dbChainKeyToComplexIdMap, dbComplexIdToChainKeysMap, searchResults); // for each db complex for (size_t dbId = 0; dbId < searchResults.size(); dbId++) { diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp index 22d8582d..2fa5a3f4 100644 --- a/src/workflow/MultimerCluster.cpp +++ b/src/workflow/MultimerCluster.cpp @@ -3,16 +3,14 @@ #include "FileUtil.h" #include "CommandCaller.h" #include "Util.h" -#include "Debug.h" #include "LocalParameters.h" - +#include "Debug.h" #include "multimercluster.sh.h" void setMultimerClusterDefaults(LocalParameters *p) { p->filtMultimerTmThr = 0.5; // FIX // p->filtChainTmThr=0.0; // FIX // p->filtInterfaceLddtThr = 0.0; // FIX - p->filterMode=0; } int multimercluster(int argc, const char **argv, const Command &command) { From 71b1f38f78b284c66b83d6324c73185a43a605dd Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 18 Aug 2024 16:13:59 +0900 Subject: [PATCH 127/160] complex_h --- data/multimercluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/multimercluster.sh b/data/multimercluster.sh index ba7b5732..bf2f5b53 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -55,7 +55,7 @@ buldCmplhDb(){ split(words[1],parts,"_") output_string=parts[1] for (j = 2; j < length(parts); j++) { - if (j < length(parts)-1){ + if (j < length(parts)){ output_string=output_string"_" } output_string = output_string parts[j] From 9730f0594e5bacfd53fa803d88aa4a2aac80521d Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 28 Aug 2024 18:40:31 +0900 Subject: [PATCH 128/160] minor --- src/strucclustutils/filtermultimer.cpp | 27 +++++++++++++++----------- src/strucclustutils/scoremultimer.cpp | 5 +++++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 4040946c..61213db2 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -123,6 +123,9 @@ class ComplexFilterCriteria { } switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: + if (qChainNum != tChainNum){ + return false; + } for (size_t i = 0; i < qAlnChainTms.size(); i++) { if (qAlnChainTms[i] < chainTmThr || tAlnChainTms[i] < chainTmThr) { return false; @@ -154,7 +157,7 @@ class ComplexFilterCriteria { const bool TmOK = TmThr ? hasTm(TmThr, covMode) : true; const bool chainTmOK = chainTmThr ? hasChainTm(chainTmThr, covMode, qChainNum, tChainNum) : true; const bool lddtOK = iLddtThr ? (interfaceLddt >= iLddtThr) : true; - return (covOK && TmOK && chainTmOK && lddtOK); + return (covOK && TmOK && chainTmOK && lddtOK); } void updateAln(unsigned int qAlnLen, unsigned int tAlnLen) { @@ -258,15 +261,17 @@ class ComplexFilterCriteria { AlignedCoordinate tInterface(intLen); size_t idx = 0; for (size_t chainIdx = 0; chainIdx < qInterfacePos.size(); chainIdx++) { - for (size_t resIdx: qInterfacePos[chainIdx]) { - qInterface.x[idx] = qAlnChains[chainIdx].x[resIdx]; - qInterface.y[idx] = qAlnChains[chainIdx].y[resIdx]; - qInterface.z[idx] = qAlnChains[chainIdx].z[resIdx]; - tInterface.x[idx] = tAlnChains[chainIdx].x[resIdx]; - tInterface.y[idx] = tAlnChains[chainIdx].y[resIdx]; - tInterface.z[idx] = tAlnChains[chainIdx].z[resIdx]; - idx++; - } + // if (qInterfacePos[chainIdx].size() >= 4){ + for (size_t resIdx: qInterfacePos[chainIdx]) { + qInterface.x[idx] = qAlnChains[chainIdx].x[resIdx]; + qInterface.y[idx] = qAlnChains[chainIdx].y[resIdx]; + qInterface.z[idx] = qAlnChains[chainIdx].z[resIdx]; + tInterface.x[idx] = tAlnChains[chainIdx].x[resIdx]; + tInterface.y[idx] = tAlnChains[chainIdx].y[resIdx]; + tInterface.z[idx] = tAlnChains[chainIdx].z[resIdx]; + idx++; + } + // } } std::string bt(intLen, 'M'); LDDTCalculator *lddtcalculator = NULL; @@ -536,7 +541,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; //if target is monomer, break to be singleton - unsigned int tChainAlnId = alnDbr.getId(tChainKey); + // unsigned int tChainAlnId = alnDbr.getId(tChainKey); // if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ // continue; // } diff --git a/src/strucclustutils/scoremultimer.cpp b/src/strucclustutils/scoremultimer.cpp index 98dbb321..a5fe8833 100644 --- a/src/strucclustutils/scoremultimer.cpp +++ b/src/strucclustutils/scoremultimer.cpp @@ -703,15 +703,18 @@ int scoremultimer(int argc, const char **argv, const Command &command) { #pragma omp for schedule(dynamic, 1) // for each q complex for (size_t qCompIdx = 0; qCompIdx < qComplexIndices.size(); qCompIdx++) { + Debug(Debug::WARNING)< &qChainKeys = qComplexIdToChainKeysMap.at(qComplexId); // if (qChainKeys.size() < MULTIPLE_CHAINED_COMPLEX) // continue; complexScorer.getSearchResults(qComplexId, qChainKeys, dbChainKeyToComplexIdMap, dbComplexIdToChainKeysMap, searchResults); + Debug(Debug::WARNING)<<"finished getSearchResults\n"; // for each db complex for (size_t dbId = 0; dbId < searchResults.size(); dbId++) { complexScorer.getAssignments(searchResults[dbId], assignments); } + Debug(Debug::WARNING)<<"finished getAssignments\n"; SORT_SERIAL(assignments.begin(), assignments.end(), compareAssignment); // for each query chain key for (size_t qChainKeyIdx = 0; qChainKeyIdx < qChainKeys.size(); qChainKeyIdx++) { @@ -729,11 +732,13 @@ int scoremultimer(int argc, const char **argv, const Command &command) { resultToWriteLines[currIdx].append(buffer); } } + Debug(Debug::WARNING)<<"finished for each assignment\n"; for (size_t qChainKeyIdx = 0; qChainKeyIdx < qChainKeys.size(); qChainKeyIdx++) { resultToWrite_t &resultToWrite = resultToWriteLines[qChainKeyIdx]; unsigned int & qKey = qChainKeys[qChainKeyIdx]; resultWriter.writeData(resultToWrite.c_str(),resultToWrite.length(),qKey,thread_idx); } + Debug(Debug::WARNING)<<"finished writing\n"; assignments.clear(); resultToWriteLines.clear(); searchResults.clear(); From 51197117e41552226d680da10c5d2214e82e616f Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Fri, 30 Aug 2024 05:18:06 +0900 Subject: [PATCH 129/160] minor --- src/strucclustutils/filtermultimer.cpp | 57 +++++++++++++++----------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 61213db2..3b277958 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -1,11 +1,9 @@ #include "DBWriter.h" #include "Util.h" #include "LocalParameters.h" -#include "Matcher.h" #include "DBReader.h" #include "IndexReader.h" #include "FileUtil.h" -#include "MemoryMapped.h" #include "Coordinate16.h" #include "tmalign/basic_fun.h" #include "MultimerUtil.h" @@ -104,7 +102,7 @@ class ComplexFilterCriteria { tAlnChains.clear(); } - bool hasTm(float TmThr, int covMode){ + bool hasTm(float TmThr, int covMode) { switch (covMode) { case Parameters::COV_MODE_BIDIRECTIONAL: return ((qTm>= TmThr) && (tTm >= TmThr)); @@ -118,14 +116,11 @@ class ComplexFilterCriteria { } bool hasChainTm(float chainTmThr, int covMode, unsigned int qChainNum, unsigned int tChainNum) { - if (qAlnChainTms.size()= iLddtThr) : true; - return (covOK && TmOK && chainTmOK && lddtOK); + return (covOK && TmOK && chainTmOK && lddtOK && chainNumOK); } void updateAln(unsigned int qAlnLen, unsigned int tAlnLen) { @@ -224,7 +233,7 @@ class ComplexFilterCriteria { } void computeInterfaceLddt(float threshold = 8) { - if(qAlnChains.size() == 1){ + if (qAlnChains.size() == 1) { interfaceLddt = 1; } float t2 = threshold * threshold; @@ -261,7 +270,7 @@ class ComplexFilterCriteria { AlignedCoordinate tInterface(intLen); size_t idx = 0; for (size_t chainIdx = 0; chainIdx < qInterfacePos.size(); chainIdx++) { - // if (qInterfacePos[chainIdx].size() >= 4){ + if (qInterfacePos[chainIdx].size() >= 4) { for (size_t resIdx: qInterfacePos[chainIdx]) { qInterface.x[idx] = qAlnChains[chainIdx].x[resIdx]; qInterface.y[idx] = qAlnChains[chainIdx].y[resIdx]; @@ -271,7 +280,7 @@ class ComplexFilterCriteria { tInterface.z[idx] = tAlnChains[chainIdx].z[resIdx]; idx++; } - // } + } } std::string bt(intLen, 'M'); LDDTCalculator *lddtcalculator = NULL; @@ -322,7 +331,7 @@ void fillTArr(const std::string &tString, float (&t)[3]) { } } -unsigned int cigarToAlignedLength(const std::string &cigar){ +unsigned int cigarToAlignedLength(const std::string &cigar) { std::string backtrace = Matcher::uncompressAlignment(cigar); unsigned int alni = 0; for (size_t btPos = 0; btPos < backtrace.size(); btPos++) { @@ -359,7 +368,7 @@ void getComplexResidueLength( IndexReader *Dbr, std::vector &complexes) unsigned int cmpllen = 0; for (auto chainKey: chainKeys) { size_t id = Dbr->sequenceReader->getId(chainKey); - if (id == NOT_AVAILABLE_CHAIN_KEY){ + if (id == NOT_AVAILABLE_CHAIN_KEY) { break; } unsigned int reslen = Dbr->sequenceReader->getSeqLen(id); @@ -511,7 +520,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int qChainAlnId = alnDbr.getId(qChainKey); unsigned int qChainDbId = qDbr->sequenceReader->getId(qChainKey); // Handling monomer as singleton - // if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + // if (qChainAlnId == NOT_AVAILABLE_CHAIN_KEY) { // char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); // result.append(buffer, (outpos - buffer - 1)); // result.push_back('\n'); @@ -528,7 +537,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t while (*data != '\0' ) { ComplexDataHandler retComplex = parseScoreComplexResult(data, res); - if (!retComplex.isValid){ + if (!retComplex.isValid) { Debug(Debug::ERROR) << "No scorecomplex result provided"; EXIT(EXIT_FAILURE); } @@ -542,7 +551,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t std::vector tChainKeys = tComplexes[tComplexIdx].chainKeys; //if target is monomer, break to be singleton // unsigned int tChainAlnId = alnDbr.getId(tChainKey); - // if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY){ + // if (tChainAlnId == NOT_AVAILABLE_CHAIN_KEY) { // continue; // } float u[3][3]; @@ -557,7 +566,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); cmplfiltcrit.updateAln(qalnlen, talnlen); - + // save Aligned coordinatese if needed : chainTmThr & lddtThr if (par.filtChainTmThr > 0.0f || par.filtInterfaceLddtThr > 0.0f) { char *tcadata = tStructDbr->getData(tChainDbId, thread_idx); @@ -575,7 +584,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } // Filter the target complexes and get the best alignment - for (auto& assId_res : localComplexMap){ + for (auto& assId_res : localComplexMap) { unsigned int tComplexId = assId_res.second.targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdx]; @@ -588,27 +597,27 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } // Check if the criteria are met - if (!(cmplfiltcrit.satisfy(par.covMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))){ + if (!(cmplfiltcrit.satisfy(par.covMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))) { continue; } unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); // Get the best alignement per each target complex - if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()){ + if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } else { - if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]){ + if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } } } - for (const auto& pair : cmplIdToBestAssId){ + for (const auto& pair : cmplIdToBestAssId) { selectedAssIDs.push_back(pair.second[0]); } - for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++){ + for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++) { unsigned int assId = selectedAssIDs[assIdidx]; ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); unsigned int tComplexId = cmplfiltcrit.targetComplexId; @@ -637,7 +646,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } } // MP end - for (auto &pair : qComplexIdResult){ + for (auto &pair : qComplexIdResult) { resultWriter.writeData(pair.second.c_str(), pair.second.length(), pair.first); } From 05a80c5869df19d2e181f30f6e91768445a3366f Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Fri, 30 Aug 2024 19:06:35 +0900 Subject: [PATCH 130/160] filtcov.tsv to complex_filt_info file. createtsv query query complex_filt_info filtcov.tsv possible --- data/multimercluster.sh | 2 +- src/strucclustutils/filtermultimer.cpp | 112 +++++++++++++++++-------- 2 files changed, 80 insertions(+), 34 deletions(-) diff --git a/data/multimercluster.sh b/data/multimercluster.sh index bf2f5b53..bb646b19 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -84,7 +84,7 @@ fi if notExists "complex_filt.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" "${TMP_PATH}/filtcov.tsv" ${FILTERMULTIMER_PAR} \ + "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" "${TMP_PATH}/complex_filt_info" ${FILTERMULTIMER_PAR} \ || fail "FilterMultimer died" fi diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 3b277958..10aed6c9 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -73,6 +73,7 @@ class ComplexFilterCriteria { float interfaceLddt; double qTm; double tTm; + double avgTm; float t[3]; float u[3][3]; @@ -160,12 +161,28 @@ class ComplexFilterCriteria { return true; } + void calculateAvgTm(int covMode){ + switch (covMode) { + case Parameters::COV_MODE_BIDIRECTIONAL: + avgTm = ( qTm + tTm ) / 2 ; + break; + case Parameters::COV_MODE_TARGET: + avgTm = tTm ; + break; + case Parameters::COV_MODE_QUERY: + avgTm = qTm ; + break; + default : + avgTm = ( qTm + tTm ) / 2 ; + } + } bool satisfy(int covMode, float covThr, float TmThr, float chainTmThr, float iLddtThr, size_t qChainNum, size_t tChainNum ) { const bool covOK = covThr ? Util::hasCoverage(covThr, covMode, qCov, tCov) : true; const bool TmOK = TmThr ? hasTm(TmThr, covMode) : true; const bool chainTmOK = chainTmThr ? hasChainTm(chainTmThr, covMode, qChainNum, tChainNum) : true; const bool chainNumOK = hasChainNum(covMode, qChainNum, tChainNum); const bool lddtOK = iLddtThr ? (interfaceLddt >= iLddtThr) : true; + calculateAvgTm(covMode); return (covOK && TmOK && chainTmOK && lddtOK && chainNumOK); } @@ -292,6 +309,46 @@ class ComplexFilterCriteria { } }; + +char* filterToBuffer(ComplexFilterCriteria cmplfiltcrit, char* tmpBuff){ + *(tmpBuff-1) = '\t'; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.qCov, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.tCov, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.qTm, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.tTm, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.interfaceLddt, tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.t[0], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.t[1], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.t[2], tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[0][0], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[0][1], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[0][2], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[1][0], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[1][1], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[1][2], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[2][0], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[2][1], tmpBuff); + *(tmpBuff-1) = ','; + tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[2][2], tmpBuff); + *(tmpBuff-1) = '\n'; + return tmpBuff; +} + void fillUArr(const std::string &uString, float (&u)[3][3]) { std::string tmp; int i = 0; @@ -455,19 +512,15 @@ int filtermultimer(int argc, const char **argv, const Command &command) { DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX| DBReader::USE_DATA); alnDbr.open(DBReader::LINEAR_ACCCESS); size_t localThreads = 1; - - // Debug(Debug::WARNING) << "Monomer will be treated as singleton\nMonomer chain key: \n"; #ifdef OPENMP localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); #endif const bool shouldCompress = (par.compressed == true); const int db4Type = Parameters::DBTYPE_CLUSTER_RES; - - DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, shouldCompress, db4Type); + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, shouldCompress, db4Type); resultWriter.open(); - const int db5Type = Parameters::DBTYPE_GENERIC_DB; - DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), 1, shouldCompress, db5Type); + DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), par.threads, shouldCompress, db5Type); resultWrite5.open(); std::string qLookupFile = par.db1 + ".lookup"; @@ -480,7 +533,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t getlookupInfo(qLookupFile, qChainKeyToComplexIdMap, qComplexes, qComplexIdToIdx); getComplexResidueLength(qDbr, qComplexes); Debug::Progress progress(qComplexes.size()); - std::map qComplexIdResult; if (sameDB) { tChainKeyToComplexIdMap = qChainKeyToComplexIdMap; @@ -493,15 +545,15 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t #pragma omp parallel num_threads(localThreads) { - resultToWrite_t result5; char buffer[32]; + char buffer2[4096]; unsigned int thread_idx = 0; #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif resultToWrite_t result; std::map localComplexMap; - std::map> cmplIdToBestAssId; // cmplId : [assId, alnSum] + std::map> cmplIdToBestAssId; std::vector selectedAssIDs; Coordinate16 qcoords; Coordinate16 tcoords; @@ -510,7 +562,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t #pragma omp for schedule(dynamic, 1) for (size_t qComplexIdx = 0; qComplexIdx < qComplexes.size(); qComplexIdx++) { progress.updateProgress(); - Complex qComplex = qComplexes[qComplexIdx]; unsigned int qComplexId = qComplex.complexId; std::vector qChainKeys = qComplex.chainKeys; @@ -561,7 +612,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t unsigned int qalnlen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int talnlen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); if (localComplexMap.find(assId) == localComplexMap.end()) { - ComplexFilterCriteria cmplfiltcrit = ComplexFilterCriteria(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); + ComplexFilterCriteria cmplfiltcrit(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); localComplexMap[assId] = cmplfiltcrit; } ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); @@ -600,15 +651,20 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t if (!(cmplfiltcrit.satisfy(par.covMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))) { continue; } - - unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); + // TODO: trying to look into the results to make sure what makes more sense + // unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); + // Get the best alignement per each target complex if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { - cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; } else { - if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { - cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + // if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { + // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + // } + if (cmplfiltcrit.avgTm > cmplIdToBestAssId.at(tComplexId)[1]) { + cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; } } } @@ -616,7 +672,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t for (const auto& pair : cmplIdToBestAssId) { selectedAssIDs.push_back(pair.second[0]); } - + resultWrite5.writeStart(thread_idx); for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++) { unsigned int assId = selectedAssIDs[assIdidx]; ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); @@ -627,31 +683,22 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); result.push_back('\n'); - result5.append(qComplex.complexName + "\t" + tComplex.complexName + "\t" + std::to_string(cmplfiltcrit.qCov) + "\t" + std::to_string(cmplfiltcrit.tCov) + "\t"+ std::to_string(cmplfiltcrit.qTm)+"\t"+ std::to_string(cmplfiltcrit.tTm)+"\t"+ std::to_string(cmplfiltcrit.interfaceLddt)+"\n"); - } - #pragma omp critical - { - qComplexIdResult[qComplexId]= result; + char * tmpBuff = Itoa::u32toa_sse2(tComplexId, buffer2); + tmpBuff = filterToBuffer(cmplfiltcrit, tmpBuff); + resultWrite5.writeAdd(buffer2, tmpBuff - buffer2, thread_idx); } + resultWriter.writeData(result.c_str(), result.length(), qComplexId); + resultWrite5.writeEnd(qComplexId, thread_idx); result.clear(); localComplexMap.clear(); cmplIdToBestAssId.clear(); selectedAssIDs.clear(); } // for end - #pragma omp critical - { - resultWrite5.writeData(result5.c_str(), result5.length(), 0); - result5.clear(); - } } // MP end - - for (auto &pair : qComplexIdResult) { - resultWriter.writeData(pair.second.c_str(), pair.second.length(), pair.first); - } resultWriter.close(true); - resultWrite5.close(par.dbOut == false); + resultWrite5.close(true); qStructDbr.close(); alnDbr.close(); delete qDbr; @@ -663,6 +710,5 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t tChainKeyToComplexIdMap.clear(); qComplexes.clear(); tComplexes.clear(); - qComplexIdResult.clear(); return EXIT_SUCCESS; } \ No newline at end of file From aeacb68b91393dc2d842e5dd95561c887d0fae7a Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Fri, 30 Aug 2024 20:29:06 +0900 Subject: [PATCH 131/160] changed way to buffer for ustring, tstring --- src/strucclustutils/MultimerUtil.h | 24 +++++++++++++++++ src/strucclustutils/filtermultimer.cpp | 36 +++++++++++++------------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/src/strucclustutils/MultimerUtil.h b/src/strucclustutils/MultimerUtil.h index 6d8c6664..06628b4a 100644 --- a/src/strucclustutils/MultimerUtil.h +++ b/src/strucclustutils/MultimerUtil.h @@ -229,4 +229,28 @@ static ComplexDataHandler parseScoreComplexResult(const char *data, Matcher::res return {assId, qTmScore, tTmScore, uString, tString, true}; } +static char* fastfloatToBuffer(float value, char* buffer) { + if (value < 0) { + value *= -1; + *(buffer) = '-'; + buffer++; + } + int value1 = (int)(value); + buffer = Itoa::i32toa_sse2(value1, buffer); + *(buffer) = '.'; + buffer++; + + double value2 = value - value1; + if (value2 < 0.1){ + *(buffer) = '0'; + buffer++; + } + if (value2 < 0.01){ + *(buffer) = '0'; + buffer++; + } + buffer = Itoa::i32toa_sse2((int)(value2 * 1000), buffer); + return buffer; +} + #endif //FOLDSEEK_MULTIMERUTIL_H \ No newline at end of file diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 10aed6c9..4d1e6a7a 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -312,39 +312,39 @@ class ComplexFilterCriteria { char* filterToBuffer(ComplexFilterCriteria cmplfiltcrit, char* tmpBuff){ *(tmpBuff-1) = '\t'; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.qCov, tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.qCov, tmpBuff); *(tmpBuff-1) = '\t'; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.tCov, tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.tCov, tmpBuff); *(tmpBuff-1) = '\t'; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.qTm, tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.qTm, tmpBuff); *(tmpBuff-1) = '\t'; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.tTm, tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.tTm, tmpBuff); *(tmpBuff-1) = '\t'; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.interfaceLddt, tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.interfaceLddt, tmpBuff); *(tmpBuff-1) = '\t'; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.t[0], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[0][0], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.t[1], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[0][1], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.t[2], tmpBuff); - *(tmpBuff-1) = '\t'; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[0][0], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[0][2], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[0][1], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[1][0], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[0][2], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[1][1], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[1][0], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[1][2], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[1][1], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[2][0], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[1][2], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[2][1], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[2][0], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.u[2][2], tmpBuff); + *(tmpBuff-1) = '\t'; + tmpBuff = fastfloatToBuffer(cmplfiltcrit.t[0], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[2][1], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.t[1], tmpBuff); *(tmpBuff-1) = ','; - tmpBuff = Util::fastSeqIdToBuffer(cmplfiltcrit.u[2][2], tmpBuff); + tmpBuff = fastfloatToBuffer(cmplfiltcrit.t[2], tmpBuff); *(tmpBuff-1) = '\n'; return tmpBuff; } From 4bbdca4f6e1d17c2d86bdb5b7e9a714456617af5 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 1 Sep 2024 23:33:55 +0900 Subject: [PATCH 132/160] minor --- src/strucclustutils/scoremultimer.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/strucclustutils/scoremultimer.cpp b/src/strucclustutils/scoremultimer.cpp index a5fe8833..98dbb321 100644 --- a/src/strucclustutils/scoremultimer.cpp +++ b/src/strucclustutils/scoremultimer.cpp @@ -703,18 +703,15 @@ int scoremultimer(int argc, const char **argv, const Command &command) { #pragma omp for schedule(dynamic, 1) // for each q complex for (size_t qCompIdx = 0; qCompIdx < qComplexIndices.size(); qCompIdx++) { - Debug(Debug::WARNING)< &qChainKeys = qComplexIdToChainKeysMap.at(qComplexId); // if (qChainKeys.size() < MULTIPLE_CHAINED_COMPLEX) // continue; complexScorer.getSearchResults(qComplexId, qChainKeys, dbChainKeyToComplexIdMap, dbComplexIdToChainKeysMap, searchResults); - Debug(Debug::WARNING)<<"finished getSearchResults\n"; // for each db complex for (size_t dbId = 0; dbId < searchResults.size(); dbId++) { complexScorer.getAssignments(searchResults[dbId], assignments); } - Debug(Debug::WARNING)<<"finished getAssignments\n"; SORT_SERIAL(assignments.begin(), assignments.end(), compareAssignment); // for each query chain key for (size_t qChainKeyIdx = 0; qChainKeyIdx < qChainKeys.size(); qChainKeyIdx++) { @@ -732,13 +729,11 @@ int scoremultimer(int argc, const char **argv, const Command &command) { resultToWriteLines[currIdx].append(buffer); } } - Debug(Debug::WARNING)<<"finished for each assignment\n"; for (size_t qChainKeyIdx = 0; qChainKeyIdx < qChainKeys.size(); qChainKeyIdx++) { resultToWrite_t &resultToWrite = resultToWriteLines[qChainKeyIdx]; unsigned int & qKey = qChainKeys[qChainKeyIdx]; resultWriter.writeData(resultToWrite.c_str(),resultToWrite.length(),qKey,thread_idx); } - Debug(Debug::WARNING)<<"finished writing\n"; assignments.clear(); resultToWriteLines.clear(); searchResults.clear(); From a6bee293abab7ee727ac3ac14d00b0b90b339292 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 2 Sep 2024 01:35:08 +0900 Subject: [PATCH 133/160] important issue solved, thread_idx while writing --- src/strucclustutils/filtermultimer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 4d1e6a7a..1ce97bee 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -688,7 +688,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t tmpBuff = filterToBuffer(cmplfiltcrit, tmpBuff); resultWrite5.writeAdd(buffer2, tmpBuff - buffer2, thread_idx); } - resultWriter.writeData(result.c_str(), result.length(), qComplexId); + resultWriter.writeData(result.c_str(), result.length(), qComplexId, thread_idx); resultWrite5.writeEnd(qComplexId, thread_idx); result.clear(); localComplexMap.clear(); From 6db582eac469e2fde79a8c5e676f749d42947212 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 2 Sep 2024 03:03:59 +0900 Subject: [PATCH 134/160] made filtermultimer to get one argument as output. 'output' and 'outout_info' will be the actual outputs --- data/multimercluster.sh | 2 +- src/FoldseekBase.cpp | 3 +-- src/strucclustutils/filtermultimer.cpp | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/data/multimercluster.sh b/data/multimercluster.sh index bb646b19..8b0cb558 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -84,7 +84,7 @@ fi if notExists "complex_filt.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" "${TMP_PATH}/complex_filt_info" ${FILTERMULTIMER_PAR} \ + "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERMULTIMER_PAR} \ || fail "FilterMultimer died" fi diff --git a/src/FoldseekBase.cpp b/src/FoldseekBase.cpp index f6745043..e8429d8a 100644 --- a/src/FoldseekBase.cpp +++ b/src/FoldseekBase.cpp @@ -286,8 +286,7 @@ std::vector foldseekCommands = { {"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, - {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb }, - {"tmptsv", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::flatfile } + {"clustDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &FoldSeekDbValidator::clusterDb } } }, {"multimercluster", multimercluster, &localPar.multimerclusterworkflow, COMMAND_MAIN, diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 1ce97bee..1861a682 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -520,7 +520,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, shouldCompress, db4Type); resultWriter.open(); const int db5Type = Parameters::DBTYPE_GENERIC_DB; - DBWriter resultWrite5(par.db5.c_str(), par.db5Index.c_str(), par.threads, shouldCompress, db5Type); + DBWriter resultWrite5((par.db4 + "_info").c_str(), (par.db4 + "_info.index").c_str(), par.threads, shouldCompress, db5Type); resultWrite5.open(); std::string qLookupFile = par.db1 + ".lookup"; @@ -575,7 +575,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t // char *outpos = Itoa::u32toa_sse2(qComplexId, buffer); // result.append(buffer, (outpos - buffer - 1)); // result.push_back('\n'); - // result5.append(qComplex.complexName + "\t" + tComplexes[qComplexIdx].complexName + "\t1.000000\t1.000000\t1.000000\t1.000000\n"); + // resultWriter.writeData(result.c_str(), result.length(), qComplexId, thread_idx); // break; // } From 72f5028c7438da2c0d3f50632aefe5de622c8c99 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 4 Sep 2024 17:53:03 +0900 Subject: [PATCH 135/160] map to vector, complex to multimer, [TODO] check if speed improved --- data/easymultimercluster.sh | 44 +++++------ data/multimercluster.sh | 29 +++---- src/strucclustutils/filtermultimer.cpp | 104 +++++++++++++++++-------- 3 files changed, 108 insertions(+), 69 deletions(-) diff --git a/data/easymultimercluster.sh b/data/easymultimercluster.sh index 63ca71fd..810e6d59 100644 --- a/data/easymultimercluster.sh +++ b/data/easymultimercluster.sh @@ -86,57 +86,57 @@ if notExists "${TMP_PATH}/query.dbtype"; then || fail "query createdb died" fi -if notExists "${TMP_PATH}/complex_clu.dbtype"; then +if notExists "${TMP_PATH}/multimer_clu.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" multimercluster "${TMP_PATH}/query" "${TMP_PATH}/complex_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \ + "$MMSEQS" multimercluster "${TMP_PATH}/query" "${TMP_PATH}/multimer_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \ || fail "Multimercluster died" fi SOURCE="${TMP_PATH}/query" -INPUT="${TMP_PATH}/latest/complex_db" +INPUT="${TMP_PATH}/latest/multimer_db" if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ || fail "Convert Alignments died" fi -if notExists "${TMP_PATH}/complex_rep_seqs.dbtype"; then +if notExists "${TMP_PATH}/multimer_rep_seqs.dbtype"; then mapCmplName2ChainKeys "${TMP_PATH}/cluster.tsv" "${SOURCE}" "${TMP_PATH}/rep_seqs.list" # shellcheck disable=SC2086 - "$MMSEQS" createsubdb "${TMP_PATH}/rep_seqs.list" "${SOURCE}" "${TMP_PATH}/complex_rep_seqs" ${CREATESUBDB_PAR} \ + "$MMSEQS" createsubdb "${TMP_PATH}/rep_seqs.list" "${SOURCE}" "${TMP_PATH}/multimer_rep_seqs" ${CREATESUBDB_PAR} \ || fail "createsubdb died" fi -if notExists "${TMP_PATH}/complex_rep_seq.fasta"; then +if notExists "${TMP_PATH}/multimer_rep_seq.fasta"; then # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${SOURCE}" "${SOURCE}" "${TMP_PATH}/complex_rep_seqs" "${TMP_PATH}/complex_rep_seq.fasta" ${VERBOSITY_PAR} \ + "$MMSEQS" result2flat "${SOURCE}" "${SOURCE}" "${TMP_PATH}/multimer_rep_seqs" "${TMP_PATH}/multimer_rep_seq.fasta" ${VERBOSITY_PAR} \ || fail "result2flat died" - postprocessFasta "${TMP_PATH}/complex_rep_seq.fasta" + postprocessFasta "${TMP_PATH}/multimer_rep_seq.fasta" fi #TODO: generate fasta file for all sequences -# if notExists "${TMP_PATH}/complex_all_seqs.fasta"; then -# # shellcheck disable=SC2086 -# "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/complex_clust" "${TMP_PATH}/complex_clust_seqs" ${THREADS_PAR} \ -# || fail "Result2repseq died" +if notExists "${TMP_PATH}/multimer_all_seqs.fasta"; then + # shellcheck disable=SC2086 + "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/multimer_clust_seqs" ${THREADS_PAR} \ + || fail "Result2repseq died" -# # shellcheck disable=SC2086 -# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_clust_seqs" "${TMP_PATH}/complex_all_seqs.fasta" ${VERBOSITY_PAR} \ -# || fail "result2flat died" -# fi + # shellcheck disable=SC2086 + "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clust_seqs" "${TMP_PATH}/multimer_all_seqs.fasta" ${VERBOSITY_PAR} \ + || fail "result2flat died" +fi -# mv "${TMP_PATH}/complex_all_seqs.fasta" "${RESULT}_all_seqs.fasta" -mv "${TMP_PATH}/complex_rep_seq.fasta" "${RESULT}_rep_seq.fasta" +mv "${TMP_PATH}/multimer_all_seqs.fasta" "${RESULT}_all_seqs.fasta" +mv "${TMP_PATH}/multimer_rep_seq.fasta" "${RESULT}_rep_seq.fasta" mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" if [ -n "${REMOVE_TMP}" ]; then rm "${INPUT}.0" # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_db" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/multimer_db" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - # "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_rep_seqs" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/multimer_rep_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_rep_seqs_h" ${VERBOSITY_PAR} # shellcheck disable=SC2086 diff --git a/data/multimercluster.sh b/data/multimercluster.sh index 8b0cb558..31a553b8 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -76,26 +76,26 @@ buldCmplhDb(){ # [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}"; -if notExists "${TMP_PATH}/complex_result.dbtype"; then +if notExists "${TMP_PATH}/multimer_result.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" multimersearch "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/multimersearch_tmp" ${MULTIMERSEARCH_PAR} \ + "$MMSEQS" multimersearch "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_result" "${TMP_PATH}/multimersearch_tmp" ${MULTIMERSEARCH_PAR} \ || fail "multimerSearch died" fi -if notExists "complex_filt.dbtype"; then +if notExists "multimer_filt.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/complex_result" "${TMP_PATH}/complex_filt" ${FILTERMULTIMER_PAR} \ + "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_result" "${TMP_PATH}/multimer_filt" ${FILTERMULTIMER_PAR} \ || fail "FilterMultimer died" fi # shift query DB, .index, .dbtype -if notExists "${TMP_PATH}/complex_db.dbtype"; then +if notExists "${TMP_PATH}/multimer_db.dbtype"; then # build complex db as output - buildCmplDb "${INPUT}" "${TMP_PATH}/complex_db" + buildCmplDb "${INPUT}" "${TMP_PATH}/multimer_db" fi # Shift _h, _h.dbtype -if notExists "${TMP_PATH}/complex_db_h.dbtype"; then +if notExists "${TMP_PATH}/multimer_db_h.dbtype"; then # # shellcheck disable=SC2086 # "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} \ # || fail "tsv2db died" @@ -103,26 +103,27 @@ if notExists "${TMP_PATH}/complex_db_h.dbtype"; then # "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" ${VERBOSITY_PAR} \ "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" --threads 1 \ || fail "createtsv died" - buldCmplhDb "${TMP_PATH}/chain_db_h.tsv" "${TMP_PATH}/complex_header.tsv" + buldCmplhDb "${TMP_PATH}/chain_db_h.tsv" "${TMP_PATH}/multimer_header.tsv" # shellcheck disable=SC2086 - "$MMSEQS" tsv2db "${TMP_PATH}/complex_header.tsv" "${TMP_PATH}/complex_db_h" ${VERBOSITY_PAR} \ + "$MMSEQS" tsv2db "${TMP_PATH}/multimer_header.tsv" "${TMP_PATH}/multimer_db_h" ${VERBOSITY_PAR} \ || fail "tsv2db died" fi -COMP="${TMP_PATH}/complex_db" +COMP="${TMP_PATH}/multimer_db" if notExists "${RESULT}.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" clust "${COMP}" "${TMP_PATH}/complex_filt" "${RESULT}" ${CLUSTER_PAR} \ + "$MMSEQS" clust "${COMP}" "${TMP_PATH}/multimer_filt" "${RESULT}" ${CLUSTER_PAR} \ || fail "Clustering died" fi if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_filt" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/multimer_filt" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_result" ${VERBOSITY_PAR} - rm "${TMP_PATH}/complex_header.tsv" + "$MMSEQS" rmdb "${TMP_PATH}/multimer_result" ${VERBOSITY_PAR} + rm "${TMP_PATH}/chain_db_h.tsv" + rm "${TMP_PATH}/multimer_header.tsv" rm -rf "${TMP_PATH}/multimersearch_tmp" rm -f "${TMP_PATH}/multimercluster.sh" fi \ No newline at end of file diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 1861a682..1938e57b 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -552,8 +552,12 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t thread_idx = static_cast(omp_get_thread_num()); #endif resultToWrite_t result; - std::map localComplexMap; - std::map> cmplIdToBestAssId; + //As scoremultimer prints out after sorting assignment id, map can be replaced with vector. + // std::map localComplexMap; + std::vector< ComplexFilterCriteria> localComplexVector; + // std::map> cmplIdToBestAssId; + std::vector cmpltargetIds; + std::vector targetIdBestTm; std::vector selectedAssIDs; Coordinate16 qcoords; Coordinate16 tcoords; @@ -578,7 +582,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t // resultWriter.writeData(result.c_str(), result.length(), qComplexId, thread_idx); // break; // } - + char *qcadata = qStructDbr.getData(qChainDbId, thread_idx); size_t qCaLength = qStructDbr.getEntryLen(qChainDbId); size_t qChainLen = qDbr->sequenceReader->getSeqLen(qChainDbId); @@ -611,11 +615,20 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t fillTArr(retComplex.tString, t); unsigned int qalnlen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int talnlen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - if (localComplexMap.find(assId) == localComplexMap.end()) { + if (localComplexVector.size() <= assId) { ComplexFilterCriteria cmplfiltcrit(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); - localComplexMap[assId] = cmplfiltcrit; + size_t subt = assId - localComplexVector.size(); + for (size_t sub=0; sub < subt; sub ++) { + localComplexVector.push_back(cmplfiltcrit); + } + localComplexVector.push_back(cmplfiltcrit); } - ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + // if (localComplexMap.find(assId) == localComplexMap.end()) { + // ComplexFilterCriteria cmplfiltcrit(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); + // localComplexMap[assId] = cmplfiltcrit; + // } + // ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); cmplfiltcrit.updateAln(qalnlen, talnlen); // save Aligned coordinatese if needed : chainTmThr & lddtThr @@ -635,12 +648,16 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } // Filter the target complexes and get the best alignment - for (auto& assId_res : localComplexMap) { - unsigned int tComplexId = assId_res.second.targetComplexId; + for (unsigned int assId = 0; assId < localComplexVector.size(); assId++) { + // for (auto& assId_res : localComplexMap) { + // unsigned int tComplexId = assId_res.second.targetComplexId; + unsigned int tComplexId = localComplexVector.at(assId).targetComplexId; + unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdx]; - ComplexFilterCriteria &cmplfiltcrit = assId_res.second; + // ComplexFilterCriteria &cmplfiltcrit = assId_res.second; + ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); cmplfiltcrit.calcCov(qComplex.complexLength, tComplex.complexLength); if (par.filtInterfaceLddtThr > 0.0) { @@ -654,31 +671,49 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t // TODO: trying to look into the results to make sure what makes more sense // unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); - // Get the best alignement per each target complex - if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { - // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; - cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; - } - else { - // if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { - // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; - // } - if (cmplfiltcrit.avgTm > cmplIdToBestAssId.at(tComplexId)[1]) { - cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + // Get the best alignement per each target complex + // if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { + // // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + // // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + // cmplIdToBestAssId[tComplexId] = {static_cast(assId), cmplfiltcrit.avgTm}; + // } + // else { + // // if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { + // // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + // // } + // if (cmplfiltcrit.avgTm > cmplIdToBestAssId.at(tComplexId)[1]) { + // // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res), cmplfiltcrit.avgTm}; + // } + // } + + unsigned int targetindex; + auto it = std::find(cmpltargetIds.begin(), cmpltargetIds.end(), tComplexId); + if ( it == cmpltargetIds.end()) { + cmpltargetIds.push_back(tComplexId); + selectedAssIDs.push_back(assId); + targetIdBestTm.push_back(cmplfiltcrit.avgTm); + } else { + targetindex = std::distance(cmpltargetIds.begin(), it); + if (cmplfiltcrit.avgTm > targetIdBestTm[targetindex]) { + targetIdBestTm[targetindex] = cmplfiltcrit.avgTm; + selectedAssIDs[targetindex] = assId; } } - } - for (const auto& pair : cmplIdToBestAssId) { - selectedAssIDs.push_back(pair.second[0]); } + + // for (const auto& pair : cmplIdToBestAssId) { + // selectedAssIDs.push_back(pair.second[0]); + // } resultWrite5.writeStart(thread_idx); for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++) { - unsigned int assId = selectedAssIDs[assIdidx]; - ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + unsigned int assId = selectedAssIDs.at(assIdidx); + // ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); unsigned int tComplexId = cmplfiltcrit.targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); - Complex tComplex = tComplexes[tComplexIdx]; + Complex tComplex = tComplexes.at(tComplexIdx); char *outpos = Itoa::u32toa_sse2(tComplexId, buffer); result.append(buffer, (outpos - buffer - 1)); @@ -691,9 +726,12 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t resultWriter.writeData(result.c_str(), result.length(), qComplexId, thread_idx); resultWrite5.writeEnd(qComplexId, thread_idx); result.clear(); - localComplexMap.clear(); - cmplIdToBestAssId.clear(); - selectedAssIDs.clear(); + // localComplexMap.clear(); + // cmplIdToBestAssId.clear(); + selectedAssIDs.clear(); + localComplexVector.clear(); + cmpltargetIds.clear(); + targetIdBestTm.clear(); } // for end } // MP end @@ -706,9 +744,9 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t delete tDbr; delete tStructDbr; } - qChainKeyToComplexIdMap.clear(); - tChainKeyToComplexIdMap.clear(); - qComplexes.clear(); - tComplexes.clear(); + // qChainKeyToComplexIdMap.clear(); + // tChainKeyToComplexIdMap.clear(); + // qComplexes.clear(); + // tComplexes.clear(); return EXIT_SUCCESS; } \ No newline at end of file From d1605fe794fa594ac319ecc297c90b32e714ea38 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 4 Sep 2024 18:12:08 +0900 Subject: [PATCH 136/160] SIMD for tmscore --- lib/tmalign/basic_fun.h | 41 ++++++++++++++++++++++++++ src/strucclustutils/filtermultimer.cpp | 26 ++++++++-------- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/lib/tmalign/basic_fun.h b/lib/tmalign/basic_fun.h index f23b36cc..cf7e418b 100644 --- a/lib/tmalign/basic_fun.h +++ b/lib/tmalign/basic_fun.h @@ -125,4 +125,45 @@ class BasicFunction{ } } + static void do_rotation( std::vector & qx, std::vector & qy, std::vector & qz, + Coordinates & y, + int len, float t[3], float u[3][3]) + { + simd_float t0 = simdf32_set(t[0]); + simd_float t1 = simdf32_set(t[1]); + simd_float t2 = simdf32_set(t[2]); + + simd_float u00 = simdf32_set(u[0][0]); + simd_float u01 = simdf32_set(u[0][1]); + simd_float u02 = simdf32_set(u[0][2]); + simd_float u10 = simdf32_set(u[1][0]); + simd_float u11 = simdf32_set(u[1][1]); + simd_float u12 = simdf32_set(u[1][2]); + simd_float u20 = simdf32_set(u[2][0]); + simd_float u21 = simdf32_set(u[2][1]); + simd_float u22 = simdf32_set(u[2][2]); + for(int i=0; i(omp_get_thread_num()); #endif resultToWrite_t result; - //As scoremultimer prints out after sorting assignment id, map can be replaced with vector. // std::map localComplexMap; std::vector< ComplexFilterCriteria> localComplexVector; // std::map> cmplIdToBestAssId; @@ -668,8 +669,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t if (!(cmplfiltcrit.satisfy(par.covMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))) { continue; } - // TODO: trying to look into the results to make sure what makes more sense - // unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); + unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); // Get the best alignement per each target complex // if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { @@ -709,8 +709,8 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t resultWrite5.writeStart(thread_idx); for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++) { unsigned int assId = selectedAssIDs.at(assIdidx); - // ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); - ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); + ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + // ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); unsigned int tComplexId = cmplfiltcrit.targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes.at(tComplexIdx); @@ -744,9 +744,9 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t delete tDbr; delete tStructDbr; } - // qChainKeyToComplexIdMap.clear(); - // tChainKeyToComplexIdMap.clear(); - // qComplexes.clear(); - // tComplexes.clear(); + qChainKeyToComplexIdMap.clear(); + tChainKeyToComplexIdMap.clear(); + qComplexes.clear(); + tComplexes.clear(); return EXIT_SUCCESS; } \ No newline at end of file From aa2ced339403b5c96ea5209e6c81da157f343cf8 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 5 Sep 2024 01:47:17 +0900 Subject: [PATCH 137/160] all_seqs.fasta not working --- data/easymultimercluster.sh | 22 +++++++++++----------- src/strucclustutils/filtermultimer.cpp | 6 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/data/easymultimercluster.sh b/data/easymultimercluster.sh index 810e6d59..7c2068ad 100644 --- a/data/easymultimercluster.sh +++ b/data/easymultimercluster.sh @@ -115,17 +115,17 @@ if notExists "${TMP_PATH}/multimer_rep_seq.fasta"; then fi #TODO: generate fasta file for all sequences -if notExists "${TMP_PATH}/multimer_all_seqs.fasta"; then - # shellcheck disable=SC2086 - "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/multimer_clust_seqs" ${THREADS_PAR} \ - || fail "Result2repseq died" +# if notExists "${TMP_PATH}/multimer_all_seqs.fasta"; then +# # shellcheck disable=SC2086 +# "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/multimer_clust_seqs" ${THREADS_PAR} \ +# || fail "Result2repseq died" - # shellcheck disable=SC2086 - "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clust_seqs" "${TMP_PATH}/multimer_all_seqs.fasta" ${VERBOSITY_PAR} \ - || fail "result2flat died" -fi +# # shellcheck disable=SC2086 +# "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clust_seqs" "${TMP_PATH}/multimer_all_seqs.fasta" ${VERBOSITY_PAR} \ +# || fail "result2flat died" +# fi -mv "${TMP_PATH}/multimer_all_seqs.fasta" "${RESULT}_all_seqs.fasta" +# mv "${TMP_PATH}/multimer_all_seqs.fasta" "${RESULT}_all_seqs.fasta" mv "${TMP_PATH}/multimer_rep_seq.fasta" "${RESULT}_rep_seq.fasta" mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" @@ -134,11 +134,11 @@ if [ -n "${REMOVE_TMP}" ]; then # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/multimer_db" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_clu_seqs" ${VERBOSITY_PAR} + # "$MMSEQS" rmdb "${TMP_PATH}/multimer_clu_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/multimer_rep_seqs" ${VERBOSITY_PAR} # shellcheck disable=SC2086 - "$MMSEQS" rmdb "${TMP_PATH}/complex_rep_seqs_h" ${VERBOSITY_PAR} + "$MMSEQS" rmdb "${TMP_PATH}/multimer_rep_seqs_h" ${VERBOSITY_PAR} # shellcheck disable=SC2086 "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR} # shellcheck disable=SC2086 diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index b8e752e8..7c10defe 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -669,7 +669,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t if (!(cmplfiltcrit.satisfy(par.covMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))) { continue; } - unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); + // unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); // Get the best alignement per each target complex // if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { @@ -709,8 +709,8 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t resultWrite5.writeStart(thread_idx); for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++) { unsigned int assId = selectedAssIDs.at(assIdidx); - ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); - // ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); + // ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); unsigned int tComplexId = cmplfiltcrit.targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes.at(tComplexIdx); From 3fd787773750a88407b0e37ca0187cae3f38ddb9 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 5 Sep 2024 01:47:38 +0900 Subject: [PATCH 138/160] chaging readme --- README.md | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/README.md b/README.md index 66e1de4d..af3f1614 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Foldseek enables fast and sensitive comparisons of large protein structure sets. - [Important Parameters](#important-cluster-parameters) - [Multimer](#multimersearch) - [Output](#multimer-search-output) + - [MultimerCluster](#multimercluster) - [Main Modules](#main-modules) - [Examples](#examples) @@ -300,6 +301,47 @@ The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qsta 1tim.pdb.gz 8tim.pdb.gz A,B A,B 0.98941 0.98941 0.999983,0.000332,0.005813,-0.000373,0.999976,0.006884,-0.005811,-0.006886,0.999959 0.298992,0.060047,0.565875 0 ``` +### Multimercluster +The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates two output files with the following prefixes: (1) `_cluster.tsv` and (2) `_rep_seq.fasta`. The first file (1) is a [tab-separated](#tab-separated_cluster.tsv) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multiemr-fasta). + +#### Output MultimerCluster +##### Tab-separated_cluster.tsv +``` +5o002 5o002 +194l2 194l2 +194l2 193l2 +10mh121 10mh121 +10mh121 10mh114 +10mh121 10mh119 +``` +##### Representative multimer fasta +``` +#5o002 +>5o002_A +SHGK......R +>5o002_B +SHGK......R +#194l2 +>194l2_A0 +KVFG......L +>194l2_A6 +KVFG......L +``` + +#### Important multimer cluster parameters + +| Option | Category | Description | +|-------------------|-----------------|-----------------------------------------------------------------------------------------------------------| +| -e | Sensitivity | List matches below this E-value (range 0.0-inf, default: 0.001); increasing it reports more distant structures | +| --alignment-type| Alignment | 0: 3Di Gotoh-Smith-Waterman (local, not recommended), 1: TMalign (global, slow), 2: 3Di+AA Gotoh-Smith-Waterman (local, default) | +| -c | Alignment | List matches above this fraction of aligned (covered) residues (see --cov-mode) (default: 0.0); higher coverage = more global alignment | +| --cov-mode | Alignment | 0: coverage of query and target, 1: coverage of target, 2: coverage of query | + +| --multimer-tm-threshold | Alignment | accept alignments with an alignment TMscore > thr | +| --chain-tm-threshold | Alignment | accept alignments with an alignment TMscore > thr | +| --interface-lddt-threshold | Alignment | accept alignments with an alignment LDDT score > thr | + + ## Main Modules - `easy-search` fast protein structure search - `easy-cluster` fast protein structure clustering From 420038de86cc5d8fa41bc26961a77fda4d0956a4 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 5 Sep 2024 01:49:37 +0900 Subject: [PATCH 139/160] chaging readme --- README.md | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index af3f1614..6409ab9b 100644 --- a/README.md +++ b/README.md @@ -302,30 +302,30 @@ The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qsta ``` ### Multimercluster -The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates two output files with the following prefixes: (1) `_cluster.tsv` and (2) `_rep_seq.fasta`. The first file (1) is a [tab-separated](#tab-separated_cluster.tsv) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multiemr-fasta). +The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates two output files with the following prefixes: (1) `_cluster.tsv` and (2) `_rep_seq.fasta`. The first file (1) is a [tab-separated] file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences]. #### Output MultimerCluster ##### Tab-separated_cluster.tsv ``` -5o002 5o002 -194l2 194l2 -194l2 193l2 -10mh121 10mh121 -10mh121 10mh114 -10mh121 10mh119 +5o002 5o002 +194l2 194l2 +194l2 193l2 +10mh121 10mh121 +10mh121 10mh114 +10mh121 10mh119 ``` ##### Representative multimer fasta ``` #5o002 >5o002_A -SHGK......R +SHGK...R >5o002_B -SHGK......R +SHGK...R #194l2 >194l2_A0 -KVFG......L +KVFG...L >194l2_A6 -KVFG......L +KVFG...L ``` #### Important multimer cluster parameters @@ -336,7 +336,6 @@ KVFG......L | --alignment-type| Alignment | 0: 3Di Gotoh-Smith-Waterman (local, not recommended), 1: TMalign (global, slow), 2: 3Di+AA Gotoh-Smith-Waterman (local, default) | | -c | Alignment | List matches above this fraction of aligned (covered) residues (see --cov-mode) (default: 0.0); higher coverage = more global alignment | | --cov-mode | Alignment | 0: coverage of query and target, 1: coverage of target, 2: coverage of query | - | --multimer-tm-threshold | Alignment | accept alignments with an alignment TMscore > thr | | --chain-tm-threshold | Alignment | accept alignments with an alignment TMscore > thr | | --interface-lddt-threshold | Alignment | accept alignments with an alignment LDDT score > thr | From c3093cd3b7ed89bbff7455d401eb9a6973880276 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 5 Sep 2024 02:41:15 +0900 Subject: [PATCH 140/160] outputs filtcov too --- README.md | 26 ++++++++++++++++++++------ data/easymultimercluster.sh | 6 +++++- data/multimercluster.sh | 4 ++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 6409ab9b..51f9a5ab 100644 --- a/README.md +++ b/README.md @@ -302,10 +302,12 @@ The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qsta ``` ### Multimercluster -The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates two output files with the following prefixes: (1) `_cluster.tsv` and (2) `_rep_seq.fasta`. The first file (1) is a [tab-separated] file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences]. +The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates thress output files with the following prefixes: (1) `_cluster.tsv`, (2) `_rep_seq.fasta` and (3) `_cluster_report`. The first file (1) is a [tab-separated](#tab-separated-multimercluster) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multimer-fasta). The third file (3) is also a [tab-separated](#filtered_search_result) file describing filtered alignments. + +Make sure chain names in PDB/mmcIF files does not contain underscores(_). #### Output MultimerCluster -##### Tab-separated_cluster.tsv +##### Tab-separated multimercluster.tsv ``` 5o002 5o002 194l2 194l2 @@ -327,6 +329,13 @@ KVFG...L >194l2_A6 KVFG...L ``` +##### Filtered search result +The `_cluster_report` contains query coverage, target coverage, query multimer Tm, target multimer Tm, interface lddt, ustring, tstring of filtered alignments. +``` +5o0f2 5o0f2 1.000 1.000 1.000 1.000 1.000 1.000,0.000,0.000,0.000,1.000,0.000,0.000,0.000,1.000 0.000,0.000,0.000 +5o0f2 5o0d2 1.000 1.000 0.999 0.992 1.000 0.999,0.000,-0.000,-0.000,0.999,-0.000,0.000,0.000,0.999 -0.004,-0.001,0.084 +5o0f2 5o082 1.000 0.990 0.978 0.962 0.921 0.999,-0.025,-0.002,0.025,0.999,-0.001,0.002,0.001,0.999 -0.039,0.000,-0.253 +``` #### Important multimer cluster parameters @@ -335,15 +344,20 @@ KVFG...L | -e | Sensitivity | List matches below this E-value (range 0.0-inf, default: 0.001); increasing it reports more distant structures | | --alignment-type| Alignment | 0: 3Di Gotoh-Smith-Waterman (local, not recommended), 1: TMalign (global, slow), 2: 3Di+AA Gotoh-Smith-Waterman (local, default) | | -c | Alignment | List matches above this fraction of aligned (covered) residues (see --cov-mode) (default: 0.0); higher coverage = more global alignment | -| --cov-mode | Alignment | 0: coverage of query and target, 1: coverage of target, 2: coverage of query | -| --multimer-tm-threshold | Alignment | accept alignments with an alignment TMscore > thr | -| --chain-tm-threshold | Alignment | accept alignments with an alignment TMscore > thr | -| --interface-lddt-threshold | Alignment | accept alignments with an alignment LDDT score > thr | +| --cov-mode | Alignment | 0: coverage of query and target (cluster multimers only with same chain numbers), 1: coverage of target, 2: coverage of query | +| --multimer-tm-threshold | Alignment | accept alignments with an multimer alignment TMscore > thr | +| --chain-tm-threshold | Alignment | accept alignments if every single chain TMscore > thr | +| --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | +``` +foldseek easy-multimercluster example/ clu tmp +``` ## Main Modules - `easy-search` fast protein structure search - `easy-cluster` fast protein structure clustering +- `easy-multimersearch` fast protein multimer-level structure search +- `easy-multimercluster` fast protein multimer-level structure clustering - `createdb` create a database from protein structures (PDB,mmCIF, mmJSON) - `databases` download pre-assembled databases diff --git a/data/easymultimercluster.sh b/data/easymultimercluster.sh index 7c2068ad..eb5b13d7 100644 --- a/data/easymultimercluster.sh +++ b/data/easymultimercluster.sh @@ -96,7 +96,10 @@ SOURCE="${TMP_PATH}/query" INPUT="${TMP_PATH}/latest/multimer_db" if notExists "${TMP_PATH}/cluster.tsv"; then # shellcheck disable=SC2086 - "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \ + || fail "Convert Alignments died" + # shellcheck disable=SC2086 + "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clu_filt_info" "${TMP_PATH}/cluster_report" ${THREADS_PAR} \ || fail "Convert Alignments died" fi @@ -128,6 +131,7 @@ fi # mv "${TMP_PATH}/multimer_all_seqs.fasta" "${RESULT}_all_seqs.fasta" mv "${TMP_PATH}/multimer_rep_seq.fasta" "${RESULT}_rep_seq.fasta" mv "${TMP_PATH}/cluster.tsv" "${RESULT}_cluster.tsv" +mv "${TMP_PATH}/cluster_report" "${RESULT}_cluster_report" if [ -n "${REMOVE_TMP}" ]; then rm "${INPUT}.0" diff --git a/data/multimercluster.sh b/data/multimercluster.sh index 31a553b8..8379968d 100644 --- a/data/multimercluster.sh +++ b/data/multimercluster.sh @@ -115,6 +115,10 @@ if notExists "${RESULT}.dbtype"; then # shellcheck disable=SC2086 "$MMSEQS" clust "${COMP}" "${TMP_PATH}/multimer_filt" "${RESULT}" ${CLUSTER_PAR} \ || fail "Clustering died" + # shellcheck disable=SC2086 + "$MMSEQS" mvdb "${TMP_PATH}/multimer_filt_info" "${RESULT}_filt_info" \ + || fail "mv died" + fi if [ -n "${REMOVE_TMP}" ]; then From 4f1592a44b6bbb79a243de4b362baffe7dc8276a Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 5 Sep 2024 02:47:11 +0900 Subject: [PATCH 141/160] Readme --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 51f9a5ab..bba1454b 100644 --- a/README.md +++ b/README.md @@ -306,8 +306,10 @@ The `easy-multimercluster` module is designed for multimer-level structural clus Make sure chain names in PDB/mmcIF files does not contain underscores(_). + foldseek easy-multimercluster example/ clu tmp + #### Output MultimerCluster -##### Tab-separated multimercluster.tsv +##### Tab-separated multimercluster ``` 5o002 5o002 194l2 194l2 @@ -349,10 +351,6 @@ The `_cluster_report` contains query coverage, target coverage, query multimer T | --chain-tm-threshold | Alignment | accept alignments if every single chain TMscore > thr | | --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | -``` -foldseek easy-multimercluster example/ clu tmp -``` - ## Main Modules - `easy-search` fast protein structure search - `easy-cluster` fast protein structure clustering From 50c1df1b5358cc11ce814cb62cff323c825cb8fd Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 5 Sep 2024 02:49:03 +0900 Subject: [PATCH 142/160] Readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bba1454b..33ccf872 100644 --- a/README.md +++ b/README.md @@ -302,7 +302,7 @@ The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qsta ``` ### Multimercluster -The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates thress output files with the following prefixes: (1) `_cluster.tsv`, (2) `_rep_seq.fasta` and (3) `_cluster_report`. The first file (1) is a [tab-separated](#tab-separated-multimercluster) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multimer-fasta). The third file (3) is also a [tab-separated](#filtered_search_result) file describing filtered alignments. +The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates thress output files with the following prefixes: (1) `_cluster.tsv`, (2) `_rep_seq.fasta` and (3) `_cluster_report`. The first file (1) is a [tab-separated](#tab-separated-multimercluster) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multimer-fasta). The third file (3) is also a [tab-separated](#filtered-search-result) file describing filtered alignments. Make sure chain names in PDB/mmcIF files does not contain underscores(_). From 4f70b3f47660d667e1b5e49be336ebc7f20ef8b2 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 5 Sep 2024 02:50:07 +0900 Subject: [PATCH 143/160] Readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 33ccf872..6945adc5 100644 --- a/README.md +++ b/README.md @@ -332,7 +332,7 @@ KVFG...L KVFG...L ``` ##### Filtered search result -The `_cluster_report` contains query coverage, target coverage, query multimer Tm, target multimer Tm, interface lddt, ustring, tstring of filtered alignments. +The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm, interface lddt, ustring, tstring` of filtered alignments. ``` 5o0f2 5o0f2 1.000 1.000 1.000 1.000 1.000 1.000,0.000,0.000,0.000,1.000,0.000,0.000,0.000,1.000 0.000,0.000,0.000 5o0f2 5o0d2 1.000 1.000 0.999 0.992 1.000 0.999,0.000,-0.000,-0.000,0.999,-0.000,0.000,0.000,0.999 -0.004,-0.001,0.084 From de945b2b38eaddaddb496f812cc73908485ff0ff Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Sun, 8 Sep 2024 20:12:34 +0900 Subject: [PATCH 144/160] simd returns segfault --- src/strucclustutils/filtermultimer.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 7c10defe..5cb6a880 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -406,12 +406,12 @@ double computeChainTmScore(AlignedCoordinate &qchain, AlignedCoordinate &tchain, float d02 = d0*d0; Coordinates tmt(alnLen); - BasicFunction::do_rotation(tchain.x, tchain.y, tchain.z, tmt, alnLen, t, u); + // BasicFunction::do_rotation(tchain.x, tchain.y, tchain.z, tmt, alnLen, t, u); for (unsigned int k=0; k Date: Mon, 9 Sep 2024 15:11:41 +0900 Subject: [PATCH 145/160] previous version is twice faster --- src/strucclustutils/MultimerUtil.h | 4 - src/strucclustutils/filtermultimer.cpp | 118 ++++++++++++------------- 2 files changed, 59 insertions(+), 63 deletions(-) diff --git a/src/strucclustutils/MultimerUtil.h b/src/strucclustutils/MultimerUtil.h index 166fca92..4b923a0e 100644 --- a/src/strucclustutils/MultimerUtil.h +++ b/src/strucclustutils/MultimerUtil.h @@ -224,7 +224,6 @@ static ComplexDataHandler parseScoreComplexResult(const char *data, Matcher::res return {assId, qTmScore, tTmScore, uString, tString, true}; } -<<<<<<< HEAD static char* fastfloatToBuffer(float value, char* buffer) { if (value < 0) { value *= -1; @@ -250,6 +249,3 @@ static char* fastfloatToBuffer(float value, char* buffer) { } #endif //FOLDSEEK_MULTIMERUTIL_H -======= -#endif //FOLDSEEK_MULTIMERUTIL_H ->>>>>>> upstream/master diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 5cb6a880..b21d6254 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -554,11 +554,11 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t thread_idx = static_cast(omp_get_thread_num()); #endif resultToWrite_t result; - // std::map localComplexMap; - std::vector< ComplexFilterCriteria> localComplexVector; - // std::map> cmplIdToBestAssId; - std::vector cmpltargetIds; - std::vector targetIdBestTm; + std::map localComplexMap; + // std::vector< ComplexFilterCriteria> localComplexVector; + std::map> cmplIdToBestAssId; + // std::vector cmpltargetIds; + // std::vector targetIdBestTm; std::vector selectedAssIDs; Coordinate16 qcoords; Coordinate16 tcoords; @@ -570,8 +570,9 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t Complex qComplex = qComplexes[qComplexIdx]; unsigned int qComplexId = qComplex.complexId; std::vector qChainKeys = qComplex.chainKeys; - + Debug(Debug::WARNING)<sequenceReader->getId(qChainKey); @@ -616,20 +617,20 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t fillTArr(retComplex.tString, t); unsigned int qalnlen = (std::max(res.qStartPos, res.qEndPos) - std::min(res.qStartPos, res.qEndPos) + 1); unsigned int talnlen = (std::max(res.dbStartPos, res.dbEndPos) - std::min(res.dbStartPos, res.dbEndPos) + 1); - if (localComplexVector.size() <= assId) { - ComplexFilterCriteria cmplfiltcrit(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); - size_t subt = assId - localComplexVector.size(); - for (size_t sub=0; sub < subt; sub ++) { - localComplexVector.push_back(cmplfiltcrit); - } - localComplexVector.push_back(cmplfiltcrit); - } - // if (localComplexMap.find(assId) == localComplexMap.end()) { + // if (localComplexVector.size() <= assId) { // ComplexFilterCriteria cmplfiltcrit(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); - // localComplexMap[assId] = cmplfiltcrit; + // size_t subt = assId - localComplexVector.size(); + // for (size_t sub=0; sub < subt; sub ++) { + // localComplexVector.push_back(cmplfiltcrit); + // } + // localComplexVector.push_back(cmplfiltcrit); // } - // ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); - ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); + if (localComplexMap.find(assId) == localComplexMap.end()) { + ComplexFilterCriteria cmplfiltcrit(tComplexId, retComplex.qTmScore, retComplex.tTmScore, t, u); + localComplexMap[assId] = cmplfiltcrit; + } + ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + // ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); cmplfiltcrit.updateAln(qalnlen, talnlen); // save Aligned coordinatese if needed : chainTmThr & lddtThr @@ -649,16 +650,16 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t } // Filter the target complexes and get the best alignment - for (unsigned int assId = 0; assId < localComplexVector.size(); assId++) { - // for (auto& assId_res : localComplexMap) { - // unsigned int tComplexId = assId_res.second.targetComplexId; - unsigned int tComplexId = localComplexVector.at(assId).targetComplexId; + // for (unsigned int assId = 0; assId < localComplexVector.size(); assId++) { + for (auto& assId_res : localComplexMap) { + unsigned int tComplexId = assId_res.second.targetComplexId; + // unsigned int tComplexId = localComplexVector.at(assId).targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes[tComplexIdx]; - // ComplexFilterCriteria &cmplfiltcrit = assId_res.second; - ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); + ComplexFilterCriteria &cmplfiltcrit = assId_res.second; + // ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); cmplfiltcrit.calcCov(qComplex.complexLength, tComplex.complexLength); if (par.filtInterfaceLddtThr > 0.0) { @@ -672,45 +673,44 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t // unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); // Get the best alignement per each target complex - // if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { - // // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; - // // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; - // cmplIdToBestAssId[tComplexId] = {static_cast(assId), cmplfiltcrit.avgTm}; - // } - // else { - // // if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { - // // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; - // // } - // if (cmplfiltcrit.avgTm > cmplIdToBestAssId.at(tComplexId)[1]) { - // // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; - // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res), cmplfiltcrit.avgTm}; - // } - // } - - unsigned int targetindex; - auto it = std::find(cmpltargetIds.begin(), cmpltargetIds.end(), tComplexId); - if ( it == cmpltargetIds.end()) { - cmpltargetIds.push_back(tComplexId); - selectedAssIDs.push_back(assId); - targetIdBestTm.push_back(cmplfiltcrit.avgTm); + if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { + // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + // cmplIdToBestAssId[tComplexId] = {static_cast(assId), cmplfiltcrit.avgTm}; } else { - targetindex = std::distance(cmpltargetIds.begin(), it); - if (cmplfiltcrit.avgTm > targetIdBestTm[targetindex]) { - targetIdBestTm[targetindex] = cmplfiltcrit.avgTm; - selectedAssIDs[targetindex] = assId; + // if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { + // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + // } + if (cmplfiltcrit.avgTm > cmplIdToBestAssId.at(tComplexId)[1]) { + cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res), cmplfiltcrit.avgTm}; } } + + // unsigned int targetindex; + // auto it = std::find(cmpltargetIds.begin(), cmpltargetIds.end(), tComplexId); + // if ( it == cmpltargetIds.end()) { + // cmpltargetIds.push_back(tComplexId); + // selectedAssIDs.push_back(assId); + // targetIdBestTm.push_back(cmplfiltcrit.avgTm); + // } else { + // targetindex = std::distance(cmpltargetIds.begin(), it); + // if (cmplfiltcrit.avgTm > targetIdBestTm[targetindex]) { + // targetIdBestTm[targetindex] = cmplfiltcrit.avgTm; + // selectedAssIDs[targetindex] = assId; + // } + // } } - // for (const auto& pair : cmplIdToBestAssId) { - // selectedAssIDs.push_back(pair.second[0]); - // } + for (const auto& pair : cmplIdToBestAssId) { + selectedAssIDs.push_back(pair.second[0]); + } resultWrite5.writeStart(thread_idx); for (unsigned int assIdidx = 0; assIdidx < selectedAssIDs.size(); assIdidx++) { unsigned int assId = selectedAssIDs.at(assIdidx); - // ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); - ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); + ComplexFilterCriteria &cmplfiltcrit = localComplexMap.at(assId); + // ComplexFilterCriteria &cmplfiltcrit = localComplexVector.at(assId); unsigned int tComplexId = cmplfiltcrit.targetComplexId; unsigned int tComplexIdx = tComplexIdToIdx.at(tComplexId); Complex tComplex = tComplexes.at(tComplexIdx); @@ -726,12 +726,12 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t resultWriter.writeData(result.c_str(), result.length(), qComplexId, thread_idx); resultWrite5.writeEnd(qComplexId, thread_idx); result.clear(); - // localComplexMap.clear(); - // cmplIdToBestAssId.clear(); + localComplexMap.clear(); + cmplIdToBestAssId.clear(); selectedAssIDs.clear(); - localComplexVector.clear(); - cmpltargetIds.clear(); - targetIdBestTm.clear(); + // localComplexVector.clear(); + // cmpltargetIds.clear(); + // targetIdBestTm.clear(); } // for end } // MP end From 1b39c48253fa5443473c2dae6507e1bd46077041 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 9 Sep 2024 17:08:12 +0900 Subject: [PATCH 146/160] big complex first to prevent big ones to be left and run alone for few hours using 1 thread --- src/strucclustutils/filtermultimer.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index b21d6254..a8f3266c 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -544,6 +544,13 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t getlookupInfo(tLookupFile, tChainKeyToComplexIdMap, tComplexes, tComplexIdToIdx); getComplexResidueLength(tDbr, tComplexes); } + std::vector qComplexOrder(qComplexes.size()); + for (size_t qComplexIdx = 0; qComplexIdx < qComplexes.size(); qComplexIdx++) { + qComplexOrder[qComplexIdx] = qComplexIdx; + } + std::sort(qComplexOrder.begin(), qComplexOrder.end(), [&qComplexes](unsigned int lhs, unsigned int rhs) { + return qComplexes[lhs].chainKeys.size() > qComplexes[rhs].chainKeys.size(); + }); #pragma omp parallel num_threads(localThreads) { @@ -563,16 +570,16 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t Coordinate16 qcoords; Coordinate16 tcoords; - Matcher::result_t res; -#pragma omp for schedule(dynamic, 1) - for (size_t qComplexIdx = 0; qComplexIdx < qComplexes.size(); qComplexIdx++) { + Matcher::result_t res; +#pragma omp for schedule(dynamic, 1) + // for (size_t qComplexIdx = 0; qComplexIdx < qComplexes.size(); qComplexIdx++) { + for (size_t qComplexIdx : qComplexOrder) { + Debug(Debug::WARNING) << qComplexIdx<<"\n"; progress.updateProgress(); Complex qComplex = qComplexes[qComplexIdx]; unsigned int qComplexId = qComplex.complexId; std::vector qChainKeys = qComplex.chainKeys; - Debug(Debug::WARNING)<sequenceReader->getId(qChainKey); @@ -651,6 +658,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t // Filter the target complexes and get the best alignment // for (unsigned int assId = 0; assId < localComplexVector.size(); assId++) { + for (auto& assId_res : localComplexMap) { unsigned int tComplexId = assId_res.second.targetComplexId; // unsigned int tComplexId = localComplexVector.at(assId).targetComplexId; @@ -671,7 +679,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t continue; } // unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); - // Get the best alignement per each target complex if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; From f1fc6a961f86e5b75753c05603993af443d7eb0e Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 9 Sep 2024 17:40:44 +0900 Subject: [PATCH 147/160] minor --- src/strucclustutils/filtermultimer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index a8f3266c..6fcd6b99 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -574,7 +574,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t #pragma omp for schedule(dynamic, 1) // for (size_t qComplexIdx = 0; qComplexIdx < qComplexes.size(); qComplexIdx++) { for (size_t qComplexIdx : qComplexOrder) { - Debug(Debug::WARNING) << qComplexIdx<<"\n"; progress.updateProgress(); Complex qComplex = qComplexes[qComplexIdx]; unsigned int qComplexId = qComplex.complexId; From 1740274806c27b6d63e31d9cfc40a73b28e6cb9e Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 9 Sep 2024 19:42:04 +0900 Subject: [PATCH 148/160] added coverage in Readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 098265ac..b2995cf8 100644 --- a/README.md +++ b/README.md @@ -351,7 +351,9 @@ The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm | --cov-mode | Alignment | 0: coverage of query and target (cluster multimers only with same chain numbers), 1: coverage of target, 2: coverage of query | | --multimer-tm-threshold | Alignment | accept alignments with an multimer alignment TMscore > thr | | --chain-tm-threshold | Alignment | accept alignments if every single chain TMscore > thr | -| --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | +| --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | + +The coverage here represents the sum of the coverages of all aligned chains, divided by the total query/target lengths. ## Main Modules - `easy-search` fast protein structure search From fd87b160c00664e781f6aa958e2f4fba8815d34f Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 9 Sep 2024 19:44:18 +0900 Subject: [PATCH 149/160] added parameters in example in Readme --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b2995cf8..a7a2b96f 100644 --- a/README.md +++ b/README.md @@ -308,7 +308,7 @@ The `easy-multimercluster` module is designed for multimer-level structural clus Make sure chain names in PDB/mmcIF files does not contain underscores(_). - foldseek easy-multimercluster example/ clu tmp + foldseek easy-multimercluster example/ clu tmp --multimer-tm-threshold 0.65 --chain-tm-threshold 0.5 --interface-tm-threshold 0.65 #### Output MultimerCluster ##### Tab-separated multimercluster @@ -332,6 +332,8 @@ SHGK...R KVFG...L >194l2_A6 KVFG...L +#10mh121 +... ``` ##### Filtered search result The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm, interface lddt, ustring, tstring` of filtered alignments. @@ -340,6 +342,7 @@ The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm 5o0f2 5o0d2 1.000 1.000 0.999 0.992 1.000 0.999,0.000,-0.000,-0.000,0.999,-0.000,0.000,0.000,0.999 -0.004,-0.001,0.084 5o0f2 5o082 1.000 0.990 0.978 0.962 0.921 0.999,-0.025,-0.002,0.025,0.999,-0.001,0.002,0.001,0.999 -0.039,0.000,-0.253 ``` +The coverage here represents the sum of the coverages of all aligned chains, divided by the total query/target lengths. #### Important multimer cluster parameters @@ -353,8 +356,6 @@ The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm | --chain-tm-threshold | Alignment | accept alignments if every single chain TMscore > thr | | --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | -The coverage here represents the sum of the coverages of all aligned chains, divided by the total query/target lengths. - ## Main Modules - `easy-search` fast protein structure search - `easy-cluster` fast protein structure clustering From 06016bdd755642827bb7a2c14f041dfac7bdaaa1 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 9 Sep 2024 19:45:46 +0900 Subject: [PATCH 150/160] minor --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a7a2b96f..3ea41d33 100644 --- a/README.md +++ b/README.md @@ -351,10 +351,10 @@ The coverage here represents the sum of the coverages of all aligned chains, div | -e | Sensitivity | List matches below this E-value (range 0.0-inf, default: 0.001); increasing it reports more distant structures | | --alignment-type| Alignment | 0: 3Di Gotoh-Smith-Waterman (local, not recommended), 1: TMalign (global, slow), 2: 3Di+AA Gotoh-Smith-Waterman (local, default) | | -c | Alignment | List matches above this fraction of aligned (covered) residues (see --cov-mode) (default: 0.0); higher coverage = more global alignment | -| --cov-mode | Alignment | 0: coverage of query and target (cluster multimers only with same chain numbers), 1: coverage of target, 2: coverage of query | -| --multimer-tm-threshold | Alignment | accept alignments with an multimer alignment TMscore > thr | -| --chain-tm-threshold | Alignment | accept alignments if every single chain TMscore > thr | -| --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | +| --cov-mode | Alignment | 0: coverage of query and target (cluster multimers only with same chain numbers), 1: coverage of target, 2: coverage of query | +| --multimer-tm-threshold | Alignment | accept alignments with an multimer alignment TMscore > thr | +| --chain-tm-threshold | Alignment | accept alignments if every single chain TMscore > thr | +| --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | ## Main Modules - `easy-search` fast protein structure search From ac0a32b17260cc39aaf505803cd91c7368141722 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 9 Sep 2024 19:46:25 +0900 Subject: [PATCH 151/160] minor --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 3ea41d33..752486ad 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,6 @@ MCAR...Q | --min-seq-id | Alignment | the minimum sequence identity to be clustered | | --tmscore-threshold | Alignment | accept alignments with an alignment TMscore > thr | | --tmscore-threshold-mode | Alignment | normalize TMscore by 0: alignment, 1: representative, 2: member length | - | --lddt-threshold | Alignment | accept alignments with an alignment LDDT score > thr | From 5b76247b67c2c6a4eb492bc8ec0a9bfd2f8ca4f3 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 9 Sep 2024 19:48:41 +0900 Subject: [PATCH 152/160] minor --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 752486ad..441b7628 100644 --- a/README.md +++ b/README.md @@ -335,13 +335,13 @@ KVFG...L ... ``` ##### Filtered search result -The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm, interface lddt, ustring, tstring` of filtered alignments. +The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm, interface lddt, ustring, tstring` of alignments after filtering and before clustering. ``` 5o0f2 5o0f2 1.000 1.000 1.000 1.000 1.000 1.000,0.000,0.000,0.000,1.000,0.000,0.000,0.000,1.000 0.000,0.000,0.000 5o0f2 5o0d2 1.000 1.000 0.999 0.992 1.000 0.999,0.000,-0.000,-0.000,0.999,-0.000,0.000,0.000,0.999 -0.004,-0.001,0.084 5o0f2 5o082 1.000 0.990 0.978 0.962 0.921 0.999,-0.025,-0.002,0.025,0.999,-0.001,0.002,0.001,0.999 -0.039,0.000,-0.253 ``` -The coverage here represents the sum of the coverages of all aligned chains, divided by the total query/target lengths. +The query and target coverages here represent the sum of the coverages of all aligned chains, divided by the total query and target multimer length respectively. #### Important multimer cluster parameters From e1d15f64b339d135f9f33b85838c9fbf4d32c2a3 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 10 Sep 2024 00:37:37 +0900 Subject: [PATCH 153/160] alnLen seems much better --- src/strucclustutils/filtermultimer.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 6fcd6b99..43a986b1 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -563,7 +563,7 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t resultToWrite_t result; std::map localComplexMap; // std::vector< ComplexFilterCriteria> localComplexVector; - std::map> cmplIdToBestAssId; + std::map> cmplIdToBestAssId; // std::vector cmpltargetIds; // std::vector targetIdBestTm; std::vector selectedAssIDs; @@ -657,7 +657,6 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t // Filter the target complexes and get the best alignment // for (unsigned int assId = 0; assId < localComplexVector.size(); assId++) { - for (auto& assId_res : localComplexMap) { unsigned int tComplexId = assId_res.second.targetComplexId; // unsigned int tComplexId = localComplexVector.at(assId).targetComplexId; @@ -677,20 +676,20 @@ localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t if (!(cmplfiltcrit.satisfy(par.covMode, par.covThr, par.filtMultimerTmThr, par.filtChainTmThr, par.filtInterfaceLddtThr, qComplex.nChain, tComplex.nChain))) { continue; } - // unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); + unsigned int alnlen = adjustAlnLen(cmplfiltcrit.qTotalAlnLen, cmplfiltcrit.tTotalAlnLen, par.covMode); // Get the best alignement per each target complex if (cmplIdToBestAssId.find(tComplexId) == cmplIdToBestAssId.end()) { - // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; - cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; + // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; // cmplIdToBestAssId[tComplexId] = {static_cast(assId), cmplfiltcrit.avgTm}; } else { - // if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { - // cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; - // } - if (cmplfiltcrit.avgTm > cmplIdToBestAssId.at(tComplexId)[1]) { - cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; - // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res), cmplfiltcrit.avgTm}; + if (alnlen > cmplIdToBestAssId.at(tComplexId)[1]) { + cmplIdToBestAssId[tComplexId] = {assId_res.first, alnlen}; } + // if (cmplfiltcrit.avgTm > cmplIdToBestAssId.at(tComplexId)[1]) { + // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res.first), cmplfiltcrit.avgTm}; + // // cmplIdToBestAssId[tComplexId] = {static_cast(assId_res), cmplfiltcrit.avgTm}; + // } } // unsigned int targetindex; From af1e86e5a00c771a52260e50b64e4bf7102e17e4 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Tue, 10 Sep 2024 15:37:57 +0900 Subject: [PATCH 154/160] default parameters --- src/workflow/MultimerCluster.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/workflow/MultimerCluster.cpp b/src/workflow/MultimerCluster.cpp index 2fa5a3f4..688aac43 100644 --- a/src/workflow/MultimerCluster.cpp +++ b/src/workflow/MultimerCluster.cpp @@ -8,9 +8,9 @@ #include "multimercluster.sh.h" void setMultimerClusterDefaults(LocalParameters *p) { - p->filtMultimerTmThr = 0.5; // FIX - // p->filtChainTmThr=0.0; // FIX - // p->filtInterfaceLddtThr = 0.0; // FIX + p->filtMultimerTmThr = 0.65; // FIX + p->filtChainTmThr = 0.001; // FIX + p->filtInterfaceLddtThr = 0.5; // FIX } int multimercluster(int argc, const char **argv, const Command &command) { From e9b0f2348f4d5b3dc24065a5172a7bea79ba0b5e Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 11 Sep 2024 16:38:22 +0900 Subject: [PATCH 155/160] check alinged chain num when interfacelddt --- src/strucclustutils/filtermultimer.cpp | 39 ++++++++++++++++---------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index 43a986b1..e955852b 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -161,28 +161,37 @@ class ComplexFilterCriteria { return true; } - void calculateAvgTm(int covMode){ - switch (covMode) { - case Parameters::COV_MODE_BIDIRECTIONAL: - avgTm = ( qTm + tTm ) / 2 ; - break; - case Parameters::COV_MODE_TARGET: - avgTm = tTm ; - break; - case Parameters::COV_MODE_QUERY: - avgTm = qTm ; - break; - default : - avgTm = ( qTm + tTm ) / 2 ; + // void calculateAvgTm(int covMode){ + // switch (covMode) { + // case Parameters::COV_MODE_BIDIRECTIONAL: + // avgTm = ( qTm + tTm ) / 2 ; + // break; + // case Parameters::COV_MODE_TARGET: + // avgTm = tTm ; + // break; + // case Parameters::COV_MODE_QUERY: + // avgTm = qTm ; + // break; + // default : + // avgTm = ( qTm + tTm ) / 2 ; + // } + // } + + void hasInterfaceLDDT(float iLddtThr) { + if (qAlnChainTms.size()= iLddtThr); } + } bool satisfy(int covMode, float covThr, float TmThr, float chainTmThr, float iLddtThr, size_t qChainNum, size_t tChainNum ) { const bool covOK = covThr ? Util::hasCoverage(covThr, covMode, qCov, tCov) : true; const bool TmOK = TmThr ? hasTm(TmThr, covMode) : true; const bool chainTmOK = chainTmThr ? hasChainTm(chainTmThr, covMode, qChainNum, tChainNum) : true; const bool chainNumOK = hasChainNum(covMode, qChainNum, tChainNum); - const bool lddtOK = iLddtThr ? (interfaceLddt >= iLddtThr) : true; - calculateAvgTm(covMode); + const bool lddtOK = iLddtThr ? hasInterfaceLDDT(iLddtThr) : true; + // calculateAvgTm(covMode); return (covOK && TmOK && chainTmOK && lddtOK && chainNumOK); } From 7112fccb26007ce04da809c203817d8eb6bc2193 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 12 Sep 2024 18:26:46 +0900 Subject: [PATCH 156/160] minor --- src/strucclustutils/filtermultimer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index e955852b..df17d683 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -182,8 +182,6 @@ class ComplexFilterCriteria { return false; } return(interfaceLddt >= iLddtThr); - } - } bool satisfy(int covMode, float covThr, float TmThr, float chainTmThr, float iLddtThr, size_t qChainNum, size_t tChainNum ) { const bool covOK = covThr ? Util::hasCoverage(covThr, covMode, qCov, tCov) : true; From 319144b4004f0c5bdcfea9f937ed102334b1f7cb Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 12 Sep 2024 18:28:47 +0900 Subject: [PATCH 157/160] minor --- src/strucclustutils/filtermultimer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strucclustutils/filtermultimer.cpp b/src/strucclustutils/filtermultimer.cpp index df17d683..09b40d19 100644 --- a/src/strucclustutils/filtermultimer.cpp +++ b/src/strucclustutils/filtermultimer.cpp @@ -177,7 +177,7 @@ class ComplexFilterCriteria { // } // } - void hasInterfaceLDDT(float iLddtThr) { + bool hasInterfaceLDDT(float iLddtThr, unsigned int qChainNum, unsigned int tChainNum) { if (qAlnChainTms.size() Date: Thu, 19 Sep 2024 16:18:28 +0900 Subject: [PATCH 158/160] typo in Readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 441b7628..11ce4e29 100644 --- a/README.md +++ b/README.md @@ -303,7 +303,7 @@ The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qsta ``` ### Multimercluster -The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates thress output files with the following prefixes: (1) `_cluster.tsv`, (2) `_rep_seq.fasta` and (3) `_cluster_report`. The first file (1) is a [tab-separated](#tab-separated-multimercluster) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multimer-fasta). The third file (3) is also a [tab-separated](#filtered-search-result) file describing filtered alignments. +The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates three output files with the following prefixes: (1) `_cluster.tsv`, (2) `_rep_seq.fasta` and (3) `_cluster_report`. The first file (1) is a [tab-separated](#tab-separated-multimercluster) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multimer-fasta). The third file (3) is also a [tab-separated](#filtered-search-result) file describing filtered alignments. Make sure chain names in PDB/mmcIF files does not contain underscores(_). @@ -351,7 +351,7 @@ The query and target coverages here represent the sum of the coverages of all al | --alignment-type| Alignment | 0: 3Di Gotoh-Smith-Waterman (local, not recommended), 1: TMalign (global, slow), 2: 3Di+AA Gotoh-Smith-Waterman (local, default) | | -c | Alignment | List matches above this fraction of aligned (covered) residues (see --cov-mode) (default: 0.0); higher coverage = more global alignment | | --cov-mode | Alignment | 0: coverage of query and target (cluster multimers only with same chain numbers), 1: coverage of target, 2: coverage of query | -| --multimer-tm-threshold | Alignment | accept alignments with an multimer alignment TMscore > thr | +| --multimer-tm-threshold | Alignment | accept alignments with multimer alignment TMscore > thr | | --chain-tm-threshold | Alignment | accept alignments if every single chain TMscore > thr | | --interface-lddt-threshold | Alignment | accept alignments with an interface LDDT score > thr | From 8711f6b72ded828128394e07699ecce948e1db6e Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 19 Sep 2024 17:07:56 +0900 Subject: [PATCH 159/160] minor things --- lib/tmalign/basic_fun.h | 42 --------------------------------- src/commons/LocalParameters.cpp | 18 ++++++-------- src/commons/LocalParameters.h | 6 ++--- 3 files changed, 10 insertions(+), 56 deletions(-) diff --git a/lib/tmalign/basic_fun.h b/lib/tmalign/basic_fun.h index cf7e418b..bc2b53fb 100644 --- a/lib/tmalign/basic_fun.h +++ b/lib/tmalign/basic_fun.h @@ -124,46 +124,4 @@ class BasicFunction{ // transform(t, u, x.x[i], x.y[i], x.z[i], y.x[i], y.y[i], y.z[i]); } } - - static void do_rotation( std::vector & qx, std::vector & qy, std::vector & qz, - Coordinates & y, - int len, float t[3], float u[3][3]) - { - simd_float t0 = simdf32_set(t[0]); - simd_float t1 = simdf32_set(t[1]); - simd_float t2 = simdf32_set(t[2]); - - simd_float u00 = simdf32_set(u[0][0]); - simd_float u01 = simdf32_set(u[0][1]); - simd_float u02 = simdf32_set(u[0][2]); - simd_float u10 = simdf32_set(u[1][0]); - simd_float u11 = simdf32_set(u[1][1]); - simd_float u12 = simdf32_set(u[1][2]); - simd_float u20 = simdf32_set(u[2][0]); - simd_float u21 = simdf32_set(u[2][1]); - simd_float u22 = simdf32_set(u[2][2]); - for(int i=0; i Date: Fri, 20 Sep 2024 16:11:46 +0900 Subject: [PATCH 160/160] order in LocalParameters --- src/commons/LocalParameters.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index a22e574b..829d3633 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -194,13 +194,13 @@ LocalParameters::LocalParameters() : scoremultimer.push_back(&PARAM_V); //filtermultimer - filtermultimer.push_back(&PARAM_V); - filtermultimer.push_back(&PARAM_THREADS); filtermultimer.push_back(&PARAM_C); filtermultimer.push_back(&PARAM_COV_MODE); filtermultimer.push_back(&PARAM_MULTIMER_TM_THRESHOLD); filtermultimer.push_back(&PARAM_CHAIN_TM_THRESHOLD); filtermultimer.push_back(&PARAM_INTERFACE_LDDT_THRESHOLD); + filtermultimer.push_back(&PARAM_THREADS); + filtermultimer.push_back(&PARAM_V); // createmultimerreport createmultimerreport.push_back(&PARAM_DB_OUTPUT); @@ -265,8 +265,8 @@ LocalParameters::LocalParameters() : dbSuffixList = "_h,_ss,_ca"; indexExclude = 0; multimerReportMode = 1; - eValueThrExpandMultimer = 10000.0; - prostt5Model = ""; + eValueThrExpandMultimer = 10000.0; + prostt5Model = ""; gpu = 0; filtMultimerTmThr = 0.0; filtChainTmThr = 0.0;