Merge pull request #353 from rachelse/master

Multimer cluster
steineggerlab · Sep 20, 2024 · 232a4c4 · 232a4c4
2 parents 8809363 + 19595fd
commit 232a4c4
Show file tree

Hide file tree

Showing 15 changed files with 1,383 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -32,6 +32,7 @@ Foldseek enables fast and sensitive comparisons of large protein structure sets.
     - [Important Parameters](#important-cluster-parameters)
   - [Multimer](#multimersearch)
     - [Output](#multimer-search-output)
+  - [MultimerCluster](#multimercluster)
 - [Main Modules](#main-modules)
 - [Examples](#examples)
 
@@ -239,7 +240,6 @@ MCAR...Q
 | --min-seq-id      | Alignment  | the minimum sequence identity to be clustered                               |
 | --tmscore-threshold      | Alignment  | accept alignments with an alignment TMscore > thr                               |
 | --tmscore-threshold-mode    | Alignment  | normalize TMscore by 0: alignment, 1: representative, 2: member length                             |
-
 | --lddt-threshold      | Alignment  | accept alignments with an alignment LDDT score > thr                               |
 
 
@@ -302,9 +302,64 @@ The default output fields are: `query,target,fident,alnlen,mismatch,gapopen,qsta
 1tim.pdb.gz 8tim.pdb.gz A,B A,B 0.98941 0.98941 0.999983,0.000332,0.005813,-0.000373,0.999976,0.006884,-0.005811,-0.006886,0.999959 0.298992,0.060047,0.565875  0
 ```
 
+### Multimercluster
+The `easy-multimercluster` module is designed for multimer-level structural clustering(supported input formats: PDB/mmCIF, flat or gzipped). By default, easy-multimercluster generates three output files with the following prefixes: (1) `_cluster.tsv`, (2) `_rep_seq.fasta` and (3) `_cluster_report`.  The first file (1) is a [tab-separated](#tab-separated-multimercluster) file describing the mapping from representative multimer to member, while the second file (2) contains only [representative sequences](#representative-multimer-fasta). The third file (3) is also a [tab-separated](#filtered-search-result) file describing filtered alignments.
+
+Make sure chain names in PDB/mmcIF files does not contain underscores(_).
+
+    foldseek easy-multimercluster example/ clu tmp --multimer-tm-threshold 0.65 --chain-tm-threshold 0.5 --interface-tm-threshold 0.65
+
+#### Output MultimerCluster
+##### Tab-separated multimercluster
+```
+5o002	   5o002
+194l2	   194l2
+194l2	   193l2
+10mh121	 10mh121
+10mh121	 10mh114
+10mh121	 10mh119
+```
+##### Representative multimer fasta
+```
+#5o002
+>5o002_A
+SHGK...R
+>5o002_B
+SHGK...R
+#194l2
+>194l2_A0
+KVFG...L
+>194l2_A6
+KVFG...L
+#10mh121
+...
+```
+##### Filtered search result
+The `_cluster_report` contains `qcoverage, tcoverage, multimer qTm, multimer tTm, interface lddt, ustring, tstring` of alignments after filtering and before clustering. 
+```
+5o0f2	5o0f2	1.000	1.000	1.000	1.000	1.000	1.000,0.000,0.000,0.000,1.000,0.000,0.000,0.000,1.000	0.000,0.000,0.000
+5o0f2	5o0d2	1.000	1.000	0.999	0.992	1.000	0.999,0.000,-0.000,-0.000,0.999,-0.000,0.000,0.000,0.999	-0.004,-0.001,0.084
+5o0f2	5o082	1.000	0.990	0.978	0.962	0.921	0.999,-0.025,-0.002,0.025,0.999,-0.001,0.002,0.001,0.999	-0.039,0.000,-0.253
+```
+The query and target coverages here represent the sum of the coverages of all aligned chains, divided by the total query and target multimer length respectively.
+
+#### Important multimer cluster parameters
+
+| Option            | Category        | Description                                                                                               |
+|-------------------|-----------------|-----------------------------------------------------------------------------------------------------------|
+| -e              | Sensitivity     | List matches below this E-value (range 0.0-inf, default: 0.001); increasing it reports more distant structures |
+| --alignment-type| Alignment       | 0: 3Di Gotoh-Smith-Waterman (local, not recommended), 1: TMalign (global, slow), 2: 3Di+AA Gotoh-Smith-Waterman (local, default) |
+| -c              | Alignment  | List matches above this fraction of aligned (covered) residues (see --cov-mode) (default: 0.0); higher coverage = more global alignment |
+| --cov-mode      | Alignment  | 0: coverage of query and target (cluster multimers only with same chain numbers), 1: coverage of target, 2: coverage of query |
+| --multimer-tm-threshold      | Alignment  | accept alignments with multimer alignment TMscore > thr |
+| --chain-tm-threshold      | Alignment  | accept alignments if every single chain TMscore > thr |
+| --interface-lddt-threshold      | Alignment  | accept alignments with an interface LDDT score > thr |
+
 ## Main Modules
 - `easy-search`       fast protein structure search  
 - `easy-cluster`      fast protein structure clustering  
+- `easy-multimersearch`       fast protein multimer-level structure search  
+- `easy-multimercluster`       fast protein multimer-level structure clustering  
 - `createdb`          create a database from protein structures (PDB,mmCIF, mmJSON)
 - `databases`         download pre-assembled databases
 

diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt
@@ -15,6 +15,8 @@ set(COMPILED_RESOURCES
         vendor.js.zst
         multimersearch.sh
         easymultimersearch.sh
+        multimercluster.sh
+        easymultimercluster.sh
         )
 
 set(GENERATED_OUTPUT_HEADERS "")

diff --git a/data/easymultimercluster.sh b/data/easymultimercluster.sh
@@ -0,0 +1,163 @@
+#!/bin/sh -e
+fail() {
+    echo "Error: $1"
+    exit 1
+}
+
+notExists() {
+	[ ! -f "$1" ]
+}
+
+exists() {
+	[ -f "$1" ]
+}
+
+abspath() {
+    if [ -d "$1" ]; then
+        (cd "$1"; pwd)
+    elif [ -f "$1" ]; then
+        if [ -z "${1##*/*}" ]; then
+            echo "$(cd "${1%/*}"; pwd)/${1##*/}"
+        else
+            echo "$(pwd)/$1"
+        fi
+    elif [ -d "$(dirname "$1")" ]; then
+        echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
+    fi
+}
+
+mapCmplName2ChainKeys() {
+    awk -F"\t" 'FNR==1 {++fIndex}
+        fIndex==1 {
+            repName[$1]=1
+            if (match($1, /MODEL/)){
+                tmpName[$1]=1
+            }else{
+                tmpName[$1"_MODEL_1"]=1 
+            }
+            next
+        }
+        fIndex==2{
+            if (match($2, /MODEL/)){
+                if ($2 in tmpName){
+                repId[$1]=1
+                }else{
+                    ho[1]=1
+                }
+            }else{
+                if ($2 in repName){
+                repId[$1]=1
+                }
+            }
+            next
+        }
+        {
+            if ($3 in repId){
+                print $1
+            }
+        }
+    ' "${1}" "${2}.source" "${2}.lookup" > "${3}"
+}
+
+postprocessFasta() {
+    awk ' BEGIN {FS=">"}
+    $0 ~/^>/ {
+        # match($2, /(.*).pdb*/)
+        split($2,parts,"_")
+        complex=""
+        for (j = 1; j < length(parts); j++) {
+            complex = complex parts[j]
+            if (j < length(parts)-1){
+                complex=complex"_" 
+            }
+        }
+        if (!(complex in repComplex)) {
+            print "#"complex
+            repComplex[complex] = ""
+        }
+    }
+    {print $0}
+    ' "${1}" > "${1}.tmp" && mv "${1}.tmp" "${1}"
+}
+
+if notExists "${TMP_PATH}/query.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/query" ${CREATEDB_PAR} \
+        || fail "query createdb died"
+fi
+
+if notExists "${TMP_PATH}/multimer_clu.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" multimercluster "${TMP_PATH}/query" "${TMP_PATH}/multimer_clu" "${TMP_PATH}" ${MULTIMERCLUSTER_PAR} \
+        || fail "Multimercluster died"
+fi
+
+SOURCE="${TMP_PATH}/query"
+INPUT="${TMP_PATH}/latest/multimer_db"
+if notExists "${TMP_PATH}/cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR}   \
+        || fail "Convert Alignments died"
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clu_filt_info" "${TMP_PATH}/cluster_report" ${THREADS_PAR}  \
+        || fail "Convert Alignments died"
+fi
+
+if notExists "${TMP_PATH}/multimer_rep_seqs.dbtype"; then
+    mapCmplName2ChainKeys "${TMP_PATH}/cluster.tsv" "${SOURCE}" "${TMP_PATH}/rep_seqs.list" 
+    # shellcheck disable=SC2086
+    "$MMSEQS" createsubdb "${TMP_PATH}/rep_seqs.list" "${SOURCE}" "${TMP_PATH}/multimer_rep_seqs" ${CREATESUBDB_PAR} \
+        || fail "createsubdb died"
+fi
+
+if notExists "${TMP_PATH}/multimer_rep_seq.fasta"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" result2flat "${SOURCE}" "${SOURCE}"  "${TMP_PATH}/multimer_rep_seqs" "${TMP_PATH}/multimer_rep_seq.fasta" ${VERBOSITY_PAR} \
+            || fail "result2flat died"
+    postprocessFasta "${TMP_PATH}/multimer_rep_seq.fasta"
+fi
+
+#TODO: generate fasta file for all sequences
+# if notExists "${TMP_PATH}/multimer_all_seqs.fasta"; then
+#     # shellcheck disable=SC2086
+#     "$MMSEQS" createseqfiledb "${INPUT}" "${TMP_PATH}/multimer_clu" "${TMP_PATH}/multimer_clust_seqs" ${THREADS_PAR} \
+#             || fail "Result2repseq  died"
+
+#     # shellcheck disable=SC2086
+#     "$MMSEQS" result2flat "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_clust_seqs" "${TMP_PATH}/multimer_all_seqs.fasta" ${VERBOSITY_PAR} \
+#             || fail "result2flat died"
+# fi
+
+# mv "${TMP_PATH}/multimer_all_seqs.fasta"  "${RESULT}_all_seqs.fasta"
+mv "${TMP_PATH}/multimer_rep_seq.fasta"  "${RESULT}_rep_seq.fasta"
+mv "${TMP_PATH}/cluster.tsv"  "${RESULT}_cluster.tsv"
+mv "${TMP_PATH}/cluster_report"  "${RESULT}_cluster_report"
+
+if [ -n "${REMOVE_TMP}" ]; then
+    rm "${INPUT}.0"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/multimer_db" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    # "$MMSEQS" rmdb "${TMP_PATH}/multimer_clu_seqs" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/multimer_rep_seqs" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/multimer_rep_seqs_h" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/complex_clu" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/query" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/query_h" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${INPUT}" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${INPUT}_h" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/query_ca" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/query_ss" ${VERBOSITY_PAR}
+    rm "${TMP_PATH}/rep_seqs.list"
+    rm -rf "${TMP_PATH}/latest"
+    rm -f "${TMP_PATH}/easymultimercluster.sh"
+fi
diff --git a/data/multimercluster.sh b/data/multimercluster.sh
@@ -0,0 +1,133 @@
+#!/bin/sh -e
+fail() {
+    echo "Error: $1"
+    exit 1
+}
+
+notExists() {
+	[ ! -f "$1" ]
+}
+
+exists() {
+	[ -f "$1" ]
+}
+
+abspath() {
+    if [ -d "$1" ]; then
+        (cd "$1"; pwd)
+    elif [ -f "$1" ]; then
+        if [ -z "${1##*/*}" ]; then
+            echo "$(cd "${1%/*}"; pwd)/${1##*/}"
+        else
+            echo "$(pwd)/$1"
+        fi
+    elif [ -d "$(dirname "$1")" ]; then
+        echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
+    fi
+}
+
+# Shift initial DB to complexDB using soft-linking
+# $1: input db
+# $2: output db
+buildCmplDb() {
+    touch "${2}"
+    awk -F"\t" 'BEGIN {OFFSET=0}
+        FNR==NR{chain_len[$1]=$3;next}
+        {
+            if (!($3 in off_arr)) {
+                off_arr[$3]=OFFSET
+            }
+            cmpl_len[$3]+=chain_len[$1];OFFSET+=chain_len[$1]
+        }
+        END {
+            for (cmpl in off_arr) {
+                print cmpl"\t"off_arr[cmpl]"\t"cmpl_len[cmpl]
+            }
+        }' "${1}.index" "${1}.lookup" > "${2}.index"
+    ln -s "$(abspath "${1}")" "${2}.0"
+    cp "${1}.dbtype" "${2}.dbtype"
+}
+
+buldCmplhDb(){
+    awk -F"\t" 'BEGIN {INDEXVAL=0}
+        {
+            split($2,words," ")
+            split(words[1],parts,"_")
+            output_string=parts[1]
+            for (j = 2; j < length(parts); j++) {
+                if (j < length(parts)){
+                    output_string=output_string"_" 
+                }
+                output_string = output_string parts[j]
+            }
+            headerstring=""
+            for (k = 2; k < length(words)+1; k++) {
+                headerstring = headerstring words[k]" "
+            }
+            if (!(output_string in gogo)){
+                print INDEXVAL"\t"output_string" "headerstring
+                INDEXVAL++
+            }
+            gogo[output_string]=1
+            
+        }' "${1}" > "${2}"
+}
+
+
+# [ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "${TMP_PATH}";
+
+if notExists "${TMP_PATH}/multimer_result.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" multimersearch "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_result" "${TMP_PATH}/multimersearch_tmp" ${MULTIMERSEARCH_PAR} \
+        || fail "multimerSearch died"
+fi
+
+if notExists "multimer_filt.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" filtermultimer "${INPUT}" "${INPUT}" "${TMP_PATH}/multimer_result" "${TMP_PATH}/multimer_filt" ${FILTERMULTIMER_PAR} \
+        || fail "FilterMultimer died"
+fi
+
+# shift query DB, .index, .dbtype
+if notExists "${TMP_PATH}/multimer_db.dbtype"; then    
+    # build complex db as output
+    buildCmplDb "${INPUT}" "${TMP_PATH}/multimer_db"
+fi
+
+# Shift _h, _h.dbtype
+if notExists "${TMP_PATH}/multimer_db_h.dbtype"; then
+    # # shellcheck disable=SC2086
+    # "$MMSEQS" tsv2db "${INPUT}.source" "${TMP_PATH}/complex_db_header_tmp" ${VERBOSITY_PAR} \
+    #     || fail "tsv2db died"
+    # shellcheck disable=SC2086
+    # "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" ${VERBOSITY_PAR} \
+    "$MMSEQS" createtsv "${INPUT}" "${INPUT}_h" "${TMP_PATH}/chain_db_h.tsv" --threads 1 \
+        || fail "createtsv died"
+    buldCmplhDb "${TMP_PATH}/chain_db_h.tsv" "${TMP_PATH}/multimer_header.tsv"
+    # shellcheck disable=SC2086
+    "$MMSEQS" tsv2db "${TMP_PATH}/multimer_header.tsv" "${TMP_PATH}/multimer_db_h" ${VERBOSITY_PAR} \
+        || fail "tsv2db died"
+fi
+
+COMP="${TMP_PATH}/multimer_db"
+
+if notExists "${RESULT}.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" clust "${COMP}" "${TMP_PATH}/multimer_filt" "${RESULT}" ${CLUSTER_PAR} \
+        || fail "Clustering died"
+    # shellcheck disable=SC2086
+    "$MMSEQS" mvdb "${TMP_PATH}/multimer_filt_info" "${RESULT}_filt_info" \
+        || fail "mv died"
+
+fi
+
+if [ -n "${REMOVE_TMP}" ]; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/multimer_filt" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/multimer_result" ${VERBOSITY_PAR}
+    rm "${TMP_PATH}/chain_db_h.tsv"
+    rm "${TMP_PATH}/multimer_header.tsv"
+    rm -rf "${TMP_PATH}/multimersearch_tmp"
+    rm -f "${TMP_PATH}/multimercluster.sh"
+fi
diff --git a/lib/tmalign/basic_fun.h b/lib/tmalign/basic_fun.h
@@ -124,5 +124,4 @@ class BasicFunction{
 //        transform(t, u, x.x[i], x.y[i], x.z[i], y.x[i], y.y[i], y.z[i]);
         }
     }
-
 };