Commit

Merge pull request #4 from jhuapl-bio/prerelease

Prerelease

Merritt-Brian authored Aug 12, 2021
2 parents 9440416 + 1074981 commit ad3e6c9

Showing 19 changed files with 166 additions and 46 deletions.
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
databases/* filter=lfs diff=lfs merge=lfs -text
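Because `databases/*` is now tracked with Git LFS, a fresh checkout needs LFS support before the database tarballs exist as real files. A minimal sketch using standard git-lfs commands (the clone URL is assumed from the organization and paths in this diff, not stated in it):

```bash
# One-time LFS setup, then fetch the LFS-tracked database tarballs.
git lfs install
git clone https://github.com/jhuapl-bio/mytax.git
cd mytax
git lfs pull    # replaces databases/*.tar.gz pointer files with real content
```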
GitHub Actions workflow for building and pushing the pre-release Docker image:
@@ -9,16 +9,18 @@ jobs:
    name: Build and push Docker image for pre-release
    runs-on: ubuntu-latest
    steps:

      - name: Check out the repo
        uses: actions/checkout@v2

        with:
          lfs: true
      - name: Push to Docker Hub
        uses: docker/build-push-action@v1
        with:
          name: ${{ secrets.DOCKERHUB_REPOSITORY }}
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
          repository: ${{ secrets.DOCKERHUB_REPOSITORY }}
          tag_with_ref: true
          tag_with_sha: true
          tags: prerelease
          lfs: true
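With `tag_with_ref`, `tag_with_sha`, and `tags: prerelease` set, the pushed image is reachable by the `prerelease` tag. A hedged sketch of pulling it (the Docker Hub repository name lives in the `DOCKERHUB_REPOSITORY` secret and is not shown in this diff, so the name below is a placeholder):

```bash
# Placeholder image name; substitute the actual DOCKERHUB_REPOSITORY value.
docker pull jhuaplbio/mytax:prerelease
```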
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@

46 changes: 46 additions & 0 deletions Dockerfile
@@ -0,0 +1,46 @@
FROM continuumio/miniconda3:4.9.2

WORKDIR /opt/software

# Make RUN commands use `bash --login` (always source ~/.bashrc on each RUN)
SHELL ["/bin/bash", "--login", "-c"]

# install apt dependencies and update conda
RUN apt-get update && apt-get install git -y \
&& apt-get install -y apt-transport-https ca-certificates wget unzip bzip2 libfontconfig1 \
&& update-ca-certificates \
&& apt-get -qq -y remove curl \
&& apt-get -qq -y autoremove \
&& apt-get autoclean \
&& rm -rf /var/lib/apt/lists/* /var/log/dpkg.log

ENV PATH /opt/conda/bin:$PATH
RUN conda config --set ssl_verify no
COPY ./environment.yml /opt/environment.yml

RUN conda env create -f /opt/environment.yml

COPY databases /opt/databases
WORKDIR /opt/databases
RUN wget http://ccb.jhu.edu/software/kraken/dl/minikraken_20171019_4GB.tgz
RUN tar -xvzf minikraken_20171019_4GB.tgz
RUN mkdir -p /opt/databases && \
mv minikraken_20171013_4GB /opt/databases/minikraken && \
rm minikraken_20171019_4GB.tgz
RUN find /opt/databases -name "*tar.gz" -exec tar -xvzf {} \;
WORKDIR /opt/software/mytax
COPY src /opt/software/mytax
RUN find . -name "*.sh" | while read fn; do ln -s $PWD/$fn /usr/local/bin; done
RUN conda activate mytax && bash process_krakendb.sh -k /opt/databases/minikraken



# RUN conda activate mytax && \
# bash build_flukraken.sh -k flukraken && \
# rm -r flukraken/library flukraken/raw flukraken/database.jdb* && \
# tar c flukraken | gzip -c | tee flukraken.tar.gz && \
# rm -rf flukraken
COPY sunburst /opt/software/mytax/sunburst
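The image can also be built and explored locally. A minimal sketch, assuming the tag name `mytax:prerelease` (any tag works) and that the Git LFS content has already been pulled so `databases/` holds real tarballs rather than pointers:

```bash
# Build from the repository root; the COPY instructions above expect
# environment.yml, databases/, src/, and sunburst/ to be present there.
docker build -t mytax:prerelease .

# Open a login shell in the image; activate the environment manually with
# `conda activate mytax`, or rely on entrypoint.sh (added below) once it is
# wired up as the container entrypoint.
docker run --rm -it mytax:prerelease bash --login
```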



4 changes: 2 additions & 2 deletions README.md
@@ -10,7 +10,7 @@ Clone this repo with:

Symbolically link all shell scripts into your path, for example with:

`find mytax -name "*.sh" | while read fn; do sudo ln -s $PWD/$fn /usr/local/bin; done`
`find mytax/src -name "*.sh" | while read fn; do sudo ln -s $PWD/$fn /usr/local/bin; done`

# Dependencies

@@ -26,7 +26,7 @@ Symbolically link all shell scripts into your path, for example with:

## Building example

This pipeline is built from a central set of scripts.
This pipeline is built from a central set of scripts located in the `src` directory

Build flu-kraken example with:

3 changes: 3 additions & 0 deletions databases/flukraken.tar.gz
Git LFS file not shown
7 changes: 7 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,7 @@
#!/bin/bash --login
# The --login ensures the bash configuration is loaded,
# enabling Conda.
set +u
source /opt/conda/etc/profile.d/conda.sh
conda activate mytax

19 changes: 19 additions & 0 deletions environment.yml
@@ -0,0 +1,19 @@
name: mytax
channels:
- conda-forge
- bioconda
- anaconda
- defaults
dependencies:
- gawk=5.1.0
- gettext=0.19.8.1
- jellyfish=1.1.12
- libcxx=10.0.0
- libffi=3.3
- libiconv=1.16
- make=4.3
- ncurses=6.2
- perl=5.26.2
- readline=8.0
- kraken=1.1
- centrifuge=1.0.3
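Outside Docker, the same environment can be created directly from this file. A minimal sketch, assuming conda is already installed:

```bash
# Create and activate the mytax environment (kraken 1.1 and centrifuge 1.0.3).
conda env create -f environment.yml
conda activate mytax
```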
13 changes: 9 additions & 4 deletions build_IVR_metadata.sh → src/build_IVR_metadata.sh
@@ -58,6 +58,7 @@ usage() {
echo -e ""
echo -e "OPTIONS:"
echo -e " -h show this message"
echo -e " -c classifier [kraken, centrifuge]"
echo -e " -i input IVR metadata file (influenza_na.dat)"
echo -e " -f input IVR FASTA file (influenza.fna)"
echo -e " -o output metadata table for taxonomy creation"
@@ -80,7 +81,7 @@ logfile="/dev/null"
tempdir="/tmp"
prefix=""
outliers="KT777860" # comma-separated list of known outliers to exclude from classification

CMD="kraken"
#---------------------------------------------------------------------------------------------------
# taxid selections
# These are the taxon IDs that we will be saving from the NCBI taxonomy.
@@ -96,7 +97,7 @@ inf_D=("PB2" "PB1" "P3" "HE" "NP" "M" "NS")

#---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hi:f:o:l:w:x:" OPTION
while getopts "hi:f:o:l:w:x:c:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
@@ -106,6 +107,7 @@ do
l) logfile=$OPTARG ;;
w) tempdir=$OPTARG ;;
x) prefix=$OPTARG ;;
c) CMD=$OPTARG ;;
?) usage; exit ;;
esac
done
@@ -247,9 +249,12 @@ gawk -F $'\t' \
printf("id\troot_taxid\ttype\tsegment\tsubtype\thost\tyear\tstrain\n");
} {
if(NR==FNR) {
split($1, h, " ");
beginning = gensub(/^>/, "", "g", $1);
split(beginning, h, " ");
split(h[1], acc, "|");
header[acc[4]] = substr(h[1], 2);
acc_id = h[1]
header[acc[4]] = acc_id;
header[acc[1]] = acc_id;
} else {
if(!($1 in OUTLIERLIST) && $11 == "c" && $3 ~ /^[0-9]+$/ && $4 ~ /^(H[0-9]+)?(N[0-9]+)?$/ && $6 ~ /^[12][0-9]{3}(\/[01][0-9])?(\/[0-3][0-9])?$/){
if($8 ~ "Influenza A" && length($4) > 0 || $8 ~ "Influenza B" || $8 ~ "Influenza C" || $8 ~ "Influenza D") {
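A hedged invocation sketch for the new `-c` option; the input and output file names are illustrative, taken from the usage text above and from how build_flukraken.sh calls this script:

```bash
# Build the IVR metadata table for a centrifuge-based database instead of kraken.
bash build_IVR_metadata.sh \
    -i influenza_na.dat \
    -f influenza.fna \
    -o annotation_IVR.dat \
    -c centrifuge
```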
14 changes: 9 additions & 5 deletions build_flukraken.sh → src/build_flukraken.sh
@@ -110,10 +110,10 @@ logfile="/dev/null"
tempdir="/tmp"
prefix=""
BASE=flukraken-$(date "+%F")

CMD="kraken"
#---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hk:w:t:dl:x:" OPTION
while getopts "hk:w:t:d:l:x:r:c:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
@@ -122,8 +122,9 @@ do
t) TAXONOMY=$OPTARG ;;
l) logfile=$OPTARG ;;
w) tempdir=$OPTARG ;;
d) download="true" ;;
d) download=$OPTARG ;;
x) prefix=$OPTARG ;;
c) CMD=$OPTARG ;;
?) usage; exit ;;
esac
done
@@ -255,6 +256,7 @@ if [[ "$download" == "true" ]]; then
-w "$workdir" \
-l "$logfile" \
-w "$workdir" \
-c "$CMD" \
-x " | "

TAXONOMY="$BASE/taxonomy"
@@ -273,6 +275,7 @@ if [[ "$download" == "true" ]]; then
-o "$BASE/raw/annotation_IVR.dat" \
-l "$logfile" \
-w "$workdir" \
-c "$CMD" \
-x " | "
fi

@@ -288,11 +291,11 @@ build_taxonomy.sh \
-1 "$offset1" \
-2 "$offset2" \
-l "$logfile" \
-c "$CMD" \
-w "$workdir" \
-x " | "

#===================================================================================================
# Create Kraken database
# Create Kraken or Centrifuge database
#===================================================================================================

#-------------------------------------------------
@@ -303,6 +306,7 @@ build_krakendb.sh \
-2 "$offset2" \
-l "$logfile" \
-w "$workdir" \
-c "$CMD" \
-x " | "

#-------------------------------------------------
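A hedged end-to-end sketch with the new classifier option. The `-k` value mirrors the commented build step in the Dockerfile earlier in this diff, and `-d true` reflects the change of `-d` from a bare flag to an option that takes a value:

```bash
# Download the IVR inputs and build the flu database with centrifuge instead of kraken.
bash build_flukraken.sh -k flukraken -d true -c centrifuge
```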
35 changes: 25 additions & 10 deletions build_krakendb.sh → src/build_krakendb.sh
@@ -52,6 +52,7 @@ usage() {
echo -e "OPTIONS:"
echo -e " -h show this message"
echo -e " -k kraken database directory"
echo -e " -c Classifier to use: [kraken, centrifuge]"
echo -e " -r reference FASTA file"
echo -e " -t taxonomy (default: ${CYAN}taxonomy sub-folder in kraken database directory${NC})"
echo -e " -2 offset for taxon IDs for new metadata taxonomy levels (default: ${CYAN}2000000000${NC})"
@@ -94,10 +95,10 @@ FTP="ftp://ftp.ncbi.nih.gov"
logfile="/dev/null"
tempdir="/tmp"
prefix=""

CMD="kraken"
#---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hk:r:t:2:l:w:x:" OPTION
while getopts "hk:r:t:2:l:w:x:c:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
@@ -108,6 +109,7 @@ do
l) logfile=$OPTARG ;;
w) tempdir=$OPTARG ;;
x) prefix=$OPTARG ;;
c) CMD=$OPTARG ;;
?) usage; exit ;;
esac
done
@@ -211,14 +213,26 @@ fix_references.sh \
-w "$workdir" \
-x "$prefix | "

#-------------------------------------------------
# Build kraken database
echo_log "------ building kraken database ------"
kraken-build \
--build \
--db "$BASE" \
--threads 1 | while read line; do echo "[$(date +"%F %T")]$prefix | $line" | tee -a "$logfile"; done

# -------------------------------------------------
if [[ $CMD == 'kraken' ]]; then
# Build kraken database
echo_log "------ building kraken database ------"
kraken-build \
--build \
--db "$BASE" \
--threads 1 | while read line; do echo "[$(date +"%F %T")]$prefix | $line" | tee -a "$logfile"; done
else
echo_log "------ building centrifuge database ------"
cat $BASE/taxonomy/names.dmp | \
gawk -F '\t[|]\t' 'BEGIN{OFS="\t"}{print $2,$1}' > $BASE/seqid2taxid.map
centrifuge-build \
-p 1 \
--conversion-table $BASE/seqid2taxid.map \
--taxonomy-tree $BASE/taxonomy/nodes.dmp \
--name-table $BASE/taxonomy/names.dmp \
$REFERENCES \
flucentrifuge | while read line; do echo "[$(date +"%F %T")]$prefix | $line" | tee -a "$logfile"; done
fi
#-------------------------------------------------
# Process Kraken database
echo_log "------ processing kraken database ------"
@@ -227,6 +241,7 @@ process_krakendb.sh \
-l "$logfile" \
-w "$workdir" \
-s \
-c $CMD \
-x "$prefix | "

#-------------------------------------------------
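A hedged usage sketch for the classifier switch added above. With the default `-c kraken` the script still runs `kraken-build`; with `-c centrifuge` it derives `seqid2taxid.map` from `names.dmp` and runs `centrifuge-build` into the hard-coded `flucentrifuge` index. The paths are illustrative:

```bash
# Default kraken build:
bash build_krakendb.sh -k flukraken -r flukraken/library/references.fna

# Centrifuge build of the same references and taxonomy:
bash build_krakendb.sh -k flukraken -r flukraken/library/references.fna -c centrifuge
```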
10 changes: 6 additions & 4 deletions build_taxonomy.sh → src/build_taxonomy.sh
@@ -48,6 +48,7 @@ usage() {
echo -e ""
echo -e "OPTIONS:"
echo -e " -h show this message"
echo -e " -c classifier [kraken, centrifuge]"
echo -e " -i input metadata table"
echo -e " Format specifications:"
echo -e " (to be specified)"
@@ -78,10 +79,10 @@ offset2=2000000000
logfile="/dev/null"
tempdir="/tmp"
prefix=""

CMD="kraken"
#---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hi:t:w:1:2:l:x:" OPTION
while getopts "hi:t:w:1:2:l:x:c:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
@@ -92,6 +93,7 @@ do
2) offset2=$OPTARG ;;
l) logfile=$OPTARG ;;
x) prefix=$OPTARG ;;
c) CMD=$OPTARG ;;
?) usage; exit ;;
esac
done
@@ -339,7 +341,7 @@ gawk -F $'\t' \
if(1==0){ print "Print names and nodes arrays to files"; }
for(taxid in names) {
if(taxid+0 > OFFSET1) {
if(1==0){ print "Print taxid attribute values to names and nodes file"; }
@@ -348,7 +350,7 @@ gawk -F $'\t' \
printf("%s\t|\t%s\t|\t%s\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\n", taxid, nodes[taxid]["parent"], nodes[taxid]["label"]) >> NODES;
if(taxid+0 > OFFSET2) {
printf("%s\t|\t%s\t|\t\t|\t%s\t|\n", taxid, fasta_header[taxid], "fasta_header") >> NAMES;
printf("%s\t|\t%s\t|\t\t|\t%s\t|\n", taxid, fasta_header[taxid], "scientific name") >> NAMES;
}
if(1==0){ print "Print parent attribute values at each level to names file"; }
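A hedged invocation sketch here as well; `-i`, `-t`, and `-c` come from the usage text above, and the paths are illustrative:

```bash
# Build the custom taxonomy from the IVR metadata table for a centrifuge run.
bash build_taxonomy.sh \
    -i annotation_IVR.dat \
    -t flukraken/taxonomy \
    -c centrifuge
```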