Commit

Merge pull request #4 from jhuapl-bio/prerelease

Prerelease

Merritt-Brian authored Aug 12, 2021
2 parents 9440416 + 1074981 commit ad3e6c9

Showing 19 changed files with 166 additions and 46 deletions.
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
databases/* filter=lfs diff=lfs merge=lfs -text
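Because `databases/*` is now tracked with Git LFS, a fresh checkout needs LFS support before the database tarballs exist as real files. A minimal sketch using standard git-lfs commands (the clone URL is assumed from the organization and paths in this diff, not stated in it):

```bash
# One-time LFS setup, then fetch the LFS-tracked database tarballs.
git lfs install
git clone https://github.com/jhuapl-bio/mytax.git
cd mytax
git lfs pull    # replaces databases/*.tar.gz pointer files with real content
```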
GitHub Actions workflow for building and pushing the pre-release Docker image:
@@ -9,16 +9,18 @@ jobs:
    name: Build and push Docker image for pre-release
    runs-on: ubuntu-latest
    steps:

      - name: Check out the repo
        uses: actions/checkout@v2

        with:
          lfs: true
      - name: Push to Docker Hub
        uses: docker/build-push-action@v1
        with:
          name: ${{ secrets.DOCKERHUB_REPOSITORY }}
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
          repository: ${{ secrets.DOCKERHUB_REPOSITORY }}
          tag_with_ref: true
          tag_with_sha: true
          tags: prerelease
          lfs: true
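With `tag_with_ref`, `tag_with_sha`, and `tags: prerelease` set, the pushed image is reachable by the `prerelease` tag. A hedged sketch of pulling it (the Docker Hub repository name lives in the `DOCKERHUB_REPOSITORY` secret and is not shown in this diff, so the name below is a placeholder):

```bash
# Placeholder image name; substitute the actual DOCKERHUB_REPOSITORY value.
docker pull jhuaplbio/mytax:prerelease
```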
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@

46 changes: 46 additions & 0 deletions Dockerfile
@@ -0,0 +1,46 @@
FROM continuumio/miniconda3:4.9.2

WORKDIR /opt/software

# Make RUN commands use `bash --login` (always source ~/.bashrc on each RUN)
SHELL ["/bin/bash", "--login", "-c"]

# install apt dependencies and update conda
RUN apt-get update && apt-get install git -y \
&& apt-get install -y apt-transport-https ca-certificates wget unzip bzip2 libfontconfig1 \
&& update-ca-certificates \
&& apt-get -qq -y remove curl \
&& apt-get -qq -y autoremove \
&& apt-get autoclean \
&& rm -rf /var/lib/apt/lists/* /var/log/dpkg.log

ENV PATH /opt/conda/bin:$PATH
RUN conda config --set ssl_verify no
COPY ./environment.yml /opt/environment.yml

RUN conda env create -f /opt/environment.yml

COPY databases /opt/databases
WORKDIR /opt/databases
RUN wget http://ccb.jhu.edu/software/kraken/dl/minikraken_20171019_4GB.tgz
RUN tar -xvzf minikraken_20171019_4GB.tgz
RUN mkdir -p /opt/databases && \
mv minikraken_20171013_4GB /opt/databases/minikraken && \
rm minikraken_20171019_4GB.tgz
RUN find /opt/databases -name "*tar.gz" -exec tar -xvzf {} \;
WORKDIR /opt/software/mytax
COPY src /opt/software/mytax
RUN find . -name "*.sh" | while read fn; do ln -s $PWD/$fn /usr/local/bin; done
RUN conda activate mytax && bash process_krakendb.sh -k /opt/databases/minikraken



# RUN conda activate mytax && \
# bash build_flukraken.sh -k flukraken && \
# rm -r flukraken/library flukraken/raw flukraken/database.jdb* && \
# tar c flukraken | gzip -c | tee flukraken.tar.gz && \
# rm -rf flukraken
COPY sunburst /opt/software/mytax/sunburst
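The image can also be built and explored locally. A minimal sketch, assuming the tag name `mytax:prerelease` (any tag works) and that the Git LFS content has already been pulled so `databases/` holds real tarballs rather than pointers:

```bash
# Build from the repository root; the COPY instructions above expect
# environment.yml, databases/, src/, and sunburst/ to be present there.
docker build -t mytax:prerelease .

# Open a login shell in the image; activate the environment manually with
# `conda activate mytax`, or rely on entrypoint.sh (added below) once it is
# wired up as the container entrypoint.
docker run --rm -it mytax:prerelease bash --login
```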



4 changes: 2 additions & 2 deletions README.md
@@ -10,7 +10,7 @@ Clone this repo with:

Symbolically link all shell scripts into your path, for example with:

`find mytax -name "*.sh" | while read fn; do sudo ln -s $PWD/$fn /usr/local/bin; done`
`find mytax/src -name "*.sh" | while read fn; do sudo ln -s $PWD/$fn /usr/local/bin; done`

# Dependencies

@@ -26,7 +26,7 @@ Symbolically link all shell scripts into your path, for example with:

## Building example

This pipeline is built from a central set of scripts.
This pipeline is built from a central set of scripts located in the `src` directory

Build flu-kraken example with:

3 changes: 3 additions & 0 deletions databases/flukraken.tar.gz
Git LFS file not shown
7 changes: 7 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,7 @@
#!/bin/bash --login
# The --login ensures the bash configuration is loaded,
# enabling Conda.
set +u
source /opt/conda/etc/profile.d/conda.sh
conda activate mytax

19 changes: 19 additions & 0 deletions environment.yml
@@ -0,0 +1,19 @@
name: mytax
channels:
- conda-forge
- bioconda
- anaconda
- defaults
dependencies:
- gawk=5.1.0
- gettext=0.19.8.1
- jellyfish=1.1.12
- libcxx=10.0.0
- libffi=3.3
- libiconv=1.16
- make=4.3
- ncurses=6.2
- perl=5.26.2
- readline=8.0
- kraken=1.1
- centrifuge=1.0.3
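Outside Docker, the same environment can be created directly from this file. A minimal sketch, assuming conda is already installed:

```bash
# Create and activate the mytax environment (kraken 1.1 and centrifuge 1.0.3).
conda env create -f environment.yml
conda activate mytax
```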
13 changes: 9 additions & 4 deletions build_IVR_metadata.sh → src/build_IVR_metadata.sh
@@ -58,6 +58,7 @@ usage() {
echo -e ""
echo -e "OPTIONS:"
echo -e " -h show this message"
echo -e " -c classifier [kraken, centrifuge]"
echo -e " -i input IVR metadata file (influenza_na.dat)"
echo -e " -f input IVR FASTA file (influenza.fna)"
echo -e " -o output metadata table for taxonomy creation"
@@ -80,7 +81,7 @@ logfile="/dev/null"
tempdir="/tmp"
prefix=""
outliers="KT777860" # comma-separated list of known outliers to exclude from classification

CMD="kraken"
#---------------------------------------------------------------------------------------------------
# taxid selections
# These are the taxon IDs that we will be saving from the NCBI taxonomy.
@@ -96,7 +97,7 @@ inf_D=("PB2" "PB1" "P3" "HE" "NP" "M" "NS")

#---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hi:f:o:l:w:x:" OPTION
while getopts "hi:f:o:l:w:x:c:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
@@ -106,6 +107,7 @@ do
l) logfile=$OPTARG ;;
w) tempdir=$OPTARG ;;
x) prefix=$OPTARG ;;
c) CMD=$OPTARG ;;
?) usage; exit ;;
esac
done
@@ -247,9 +249,12 @@ gawk -F $'\t' \
printf("id\troot_taxid\ttype\tsegment\tsubtype\thost\tyear\tstrain\n");
} {
if(NR==FNR) {
split($1, h, " ");
beginning = gensub(/^>/, "", "g", $1);
split(beginning, h, " ");
split(h[1], acc, "|");
header[acc[4]] = substr(h[1], 2);
acc_id = h[1]
header[acc[4]] = acc_id;
header[acc[1]] = acc_id;
} else {
if(!($1 in OUTLIERLIST) && $11 == "c" && $3 ~ /^[0-9]+$/ && $4 ~ /^(H[0-9]+)?(N[0-9]+)?$/ && $6 ~ /^[12][0-9]{3}(\/[01][0-9])?(\/[0-3][0-9])?$/){
if($8 ~ "Influenza A" && length($4) > 0 || $8 ~ "Influenza B" || $8 ~ "Influenza C" || $8 ~ "Influenza D") {
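A hedged invocation sketch for the new `-c` option; the input and output file names are illustrative, taken from the usage text above and from how build_flukraken.sh calls this script:

```bash
# Build the IVR metadata table for a centrifuge-based database instead of kraken.
bash build_IVR_metadata.sh \
    -i influenza_na.dat \
    -f influenza.fna \
    -o annotation_IVR.dat \
    -c centrifuge
```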
14 changes: 9 additions & 5 deletions build_flukraken.sh → src/build_flukraken.sh
@@ -110,10 +110,10 @@ logfile="/dev/null"
tempdir="/tmp"
prefix=""
BASE=flukraken-$(date "+%F")

CMD="kraken"
#---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hk:w:t:dl:x:" OPTION
while getopts "hk:w:t:d:l:x:r:c:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
@@ -122,8 +122,9 @@ do
t) TAXONOMY=$OPTARG ;;
l) logfile=$OPTARG ;;
w) tempdir=$OPTARG ;;
d) download="true" ;;
d) download=$OPTARG ;;
x) prefix=$OPTARG ;;
c) CMD=$OPTARG ;;
?) usage; exit ;;
esac
done
@@ -255,6 +256,7 @@ if [[ "$download" == "true" ]]; then
-w "$workdir" \
-l "$logfile" \
-w "$workdir" \
-c "$CMD" \
-x " | "

TAXONOMY="$BASE/taxonomy"
@@ -273,6 +275,7 @@ if [[ "$download" == "true" ]]; then
-o "$BASE/raw/annotation_IVR.dat" \
-l "$logfile" \
-w "$workdir" \
-c "$CMD" \
-x " | "
fi

@@ -288,11 +291,11 @@ build_taxonomy.sh \
-1 "$offset1" \
-2 "$offset2" \
-l "$logfile" \
-c "$CMD" \
-w "$workdir" \
-x " | "

#===================================================================================================
# Create Kraken database
# Create Kraken or Centrifuge database
#===================================================================================================

#-------------------------------------------------
@@ -303,6 +306,7 @@ build_krakendb.sh \
-2 "$offset2" \
-l "$logfile" \
-w "$workdir" \
-c "$CMD" \
-x " | "

#-------------------------------------------------
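A hedged end-to-end sketch with the new classifier option. The `-k` value mirrors the commented build step in the Dockerfile earlier in this diff, and `-d true` reflects the change of `-d` from a bare flag to an option that takes a value:

```bash
# Download the IVR inputs and build the flu database with centrifuge instead of kraken.
bash build_flukraken.sh -k flukraken -d true -c centrifuge
```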
35 changes: 25 additions & 10 deletions build_krakendb.sh → src/build_krakendb.sh
@@ -52,6 +52,7 @@ usage() {
echo -e "OPTIONS:"
echo -e " -h show this message"
echo -e " -k kraken database directory"
echo -e " -c Classifier to use: [kraken, centrifuge]"
echo -e " -r reference FASTA file"
echo -e " -t taxonomy (default: ${CYAN}taxonomy sub-folder in kraken database directory${NC})"
echo -e " -2 offset for taxon IDs for new metadata taxonomy levels (default: ${CYAN}2000000000${NC})"
@@ -94,10 +95,10 @@ FTP="ftp://ftp.ncbi.nih.gov"
logfile="/dev/null"
tempdir="/tmp"
prefix=""

CMD="kraken"
#---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hk:r:t:2:l:w:x:" OPTION
while getopts "hk:r:t:2:l:w:x:c:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
@@ -108,6 +109,7 @@ do
l) logfile=$OPTARG ;;
w) tempdir=$OPTARG ;;
x) prefix=$OPTARG ;;
c) CMD=$OPTARG ;;
?) usage; exit ;;
esac
done
@@ -211,14 +213,26 @@ fix_references.sh \
-w "$workdir" \
-x "$prefix | "

#-------------------------------------------------
# Build kraken database
echo_log "------ building kraken database ------"
kraken-build \
--build \
--db "$BASE" \
--threads 1 | while read line; do echo "[$(date +"%F %T")]$prefix | $line" | tee -a "$logfile"; done

# -------------------------------------------------
if [[ $CMD == 'kraken' ]]; then
# Build kraken database
echo_log "------ building kraken database ------"
kraken-build \
--build \
--db "$BASE" \
--threads 1 | while read line; do echo "[$(date +"%F %T")]$prefix | $line" | tee -a "$logfile"; done
else
echo_log "------ building centrifuge database ------"
cat $BASE/taxonomy/names.dmp | \
gawk -F '\t[|]\t' 'BEGIN{OFS="\t"}{print $2,$1}' > $BASE/seqid2taxid.map
centrifuge-build \
-p 1 \
--conversion-table $BASE/seqid2taxid.map \
--taxonomy-tree $BASE/taxonomy/nodes.dmp \
--name-table $BASE/taxonomy/names.dmp \
$REFERENCES \
flucentrifuge | while read line; do echo "[$(date +"%F %T")]$prefix | $line" | tee -a "$logfile"; done
fi
#-------------------------------------------------
# Process Kraken database
echo_log "------ processing kraken database ------"
@@ -227,6 +241,7 @@ process_krakendb.sh \
-l "$logfile" \
-w "$workdir" \
-s \
-c $CMD \
-x "$prefix | "

#-------------------------------------------------
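A hedged usage sketch for the classifier switch added above. With the default `-c kraken` the script still runs `kraken-build`; with `-c centrifuge` it derives `seqid2taxid.map` from `names.dmp` and runs `centrifuge-build` into the hard-coded `flucentrifuge` index. The paths are illustrative:

```bash
# Default kraken build:
bash build_krakendb.sh -k flukraken -r flukraken/library/references.fna

# Centrifuge build of the same references and taxonomy:
bash build_krakendb.sh -k flukraken -r flukraken/library/references.fna -c centrifuge
```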
10 changes: 6 additions & 4 deletions build_taxonomy.sh → src/build_taxonomy.sh
@@ -48,6 +48,7 @@ usage() {
echo -e ""
echo -e "OPTIONS:"
echo -e " -h show this message"
echo -e " -c classifier [kraken, centrifuge]"
echo -e " -i input metadata table"
echo -e " Format specifications:"
echo -e " (to be specified)"
@@ -78,10 +79,10 @@ offset2=2000000000
logfile="/dev/null"
tempdir="/tmp"
prefix=""

CMD="kraken"
#---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hi:t:w:1:2:l:x:" OPTION
while getopts "hi:t:w:1:2:l:x:c:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
@@ -92,6 +93,7 @@ do
2) offset2=$OPTARG ;;
l) logfile=$OPTARG ;;
x) prefix=$OPTARG ;;
c) CMD=$OPTARG ;;
?) usage; exit ;;
esac
done
@@ -339,7 +341,7 @@ gawk -F $'\t' \
if(1==0){ print "Print names and nodes arrays to files"; }
for(taxid in names) {
if(taxid+0 > OFFSET1) {
if(1==0){ print "Print taxid attribute values to names and nodes file"; }
@@ -348,7 +350,7 @@ gawk -F $'\t' \
printf("%s\t|\t%s\t|\t%s\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\n", taxid, nodes[taxid]["parent"], nodes[taxid]["label"]) >> NODES;
if(taxid+0 > OFFSET2) {
printf("%s\t|\t%s\t|\t\t|\t%s\t|\n", taxid, fasta_header[taxid], "fasta_header") >> NAMES;
printf("%s\t|\t%s\t|\t\t|\t%s\t|\n", taxid, fasta_header[taxid], "scientific name") >> NAMES;
}
if(1==0){ print "Print parent attribute values at each level to names file"; }
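A hedged invocation sketch here as well; `-i`, `-t`, and `-c` come from the usage text above, and the paths are illustrative:

```bash
# Build the custom taxonomy from the IVR metadata table for a centrifuge run.
bash build_taxonomy.sh \
    -i annotation_IVR.dat \
    -t flukraken/taxonomy \
    -c centrifuge
```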