Merge pull request #7 from jhuapl-bio/prerelease

V1.1.0 release
jhuapl-bio · Nov 12, 2021 · 58de2c1 · 58de2c1
2 parents 0409040 + 4628839
commit 58de2c1
Show file tree

Hide file tree

Showing 11 changed files with 4,432 additions and 131 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,8 @@
 
+data/*
+databases/*
+test-data/*
+!test-data/BC0*.fastq
+!test-data/sample.fastq
+tmp/*
+taxdump
diff --git a/Dockerfile b/Dockerfile
@@ -6,42 +6,59 @@ WORKDIR /opt/software
 SHELL ["/bin/bash", "--login", "-c"]
 
 # install apt dependencies and update conda
-RUN apt-get update && apt-get install git -y \
+RUN apt-get  --allow-releaseinfo-change  update && apt-get install git -y \
     && apt-get install -y apt-transport-https ca-certificates wget unzip bzip2 libfontconfig1 \
     && update-ca-certificates \
     && apt-get -qq -y remove curl \
+    && apt-get install -y g++ gcc \
     && apt-get -qq -y autoremove \
     && apt-get autoclean \
     && rm -rf /var/lib/apt/lists/* /var/log/dpkg.log 
 
+# Set up conda environment
 ENV PATH /opt/conda/bin:$PATH
 RUN conda config --set ssl_verify no
 COPY ./environment.yml /opt/environment.yml
-
 RUN conda env create -f /opt/environment.yml
 
-COPY databases /opt/databases
+SHELL ["conda", "run", "-n", "mytax", "/bin/bash", "-c"]
+
+
+
+
+# Define databases that are shipped with the image
+# COPY databases /opt/databases
+
 WORKDIR /opt/databases
-RUN wget http://ccb.jhu.edu/software/kraken/dl/minikraken_20171019_4GB.tgz && \
-    tar -xvzf minikraken_20171019_4GB.tgz && \
-    mkdir -p /opt/databases && \
-    mv minikraken_20171013_4GB /opt/databases/minikraken && \
-    rm minikraken_20171019_4GB.tgz
-# RUN find /opt/databases -name "*tar.gz" -exec tar -xvzf {} \;
+
+RUN mkdir -p /opt/databases && \
+    wget ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_v2_8GB_201904.tgz -O /opt/databases/minikraken2.tar.gz && \
+    tar -xvzf minikraken2.tar.gz && rm -rf /opt/databases/minikraken2.tar.gz
+
 WORKDIR /opt/software/mytax
-COPY src /opt/software/mytax
+COPY src/*.sh /opt/software/mytax/
 RUN find . -name "*.sh" | while read fn; do ln -s $PWD/$fn /usr/local/bin; done 
-RUN conda activate mytax && bash process_krakendb.sh -k /opt/databases/minikraken
+# # Define Flukraken by building from source with jellyfish 
+RUN source /opt/conda/etc/profile.d/conda.sh && conda activate mytax && \
+    bash /opt/software/mytax/build_flukraken.sh -k /opt/databases/flukraken2
+
+## Get Centrifuge database
+# RUN wget https://genome-idx.s3.amazonaws.com/centrifuge/p_compressed%2Bh%2Bv.tar.gz -O /opt/databases/centrifuge.tgz && \
+#     mkdir -p /opt/databases/centrifuge && tar -xvzf /opt/databases/centrifuge.tgz --directory /opt/databases/centrifuge/
+
 
+# Install Centrifuge
+COPY ./install.sh /opt/install.sh 
+RUN bash /opt/install.sh
 
 
-# RUN conda activate mytax && \
-#     bash build_flukraken.sh -k flukraken && \
-#     rm -r flukraken/library flukraken/raw flukraken/database.jdb* && \
-#     tar c flukraken | gzip -c | tee flukraken.tar.gz && \
-#     rm -rf flukraken
+COPY src/*.py /opt/software/mytax/ 
 
 COPY sunburst /opt/software/mytax/sunburst
+RUN python3 /opt/software/mytax/generate_hierarchy.py -o /taxdump -download --report sample.report
+RUN rm -rf /tmp/*
+WORKDIR /opt/software/mytax/
 
+# ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "mytax", "bash", "-c"]
 
 
diff --git a/README.md b/README.md
@@ -55,6 +55,111 @@ The single script `build_flukraken.sh` functions as an outer wrapper for the inf
 	process_krakendb.sh -> post-processes database for visualization pipeline (not included in this repo yet)
 ```
 
+## Running process script on kraken/kraken2 report and outfiles
+
+### If running from Docker
+
+`docker build . -t jhuaplbio/mytax`
+
+Unix
+
+`docker container run -it --rm -v $PWD:/data jhuaplbio/mytax bash`
+
+Windows Powershell
+
+`docker container run -it --rm -v $pwd:/data jhuaplbio/mytax bash`
+
+```
+
+# Run the installation script
+
+
+# Activate the env, this will contain kraken2 and centrifuge scripts to build the database if needed as well as kraken2 and centrifuge dependencies
+
+conda activate mytax
+
+## Let's make a sample.fastq from test-data
+
+# Kraken1 and Centrifuge must be compiled from scratch, due to issues with libraries/binaries on compilation and execution 
+
+bash install.sh
+
+
+# first, download ncbi taxdump
+
+python3 src/generate_hierarchy.py -o $PWD/taxdump --report test-data/sample.report   -download 
+rm taxdump.tar.gz
+
+# create the kraken output first, (report and outfile)
+
+## DEPRECATED Kraken 1 
+
+mkdir -p databases/minikraken1
+wget https://ccb.jhu.edu/software/kraken/dl/minikraken_20171019_4GB.tgz -O databases/minikraken1.tgz
+tar -xvzf databases/minikraken1.tgz --directory databases/
+
+export kraken1db=databases/minikraken_20171013_4GB && \
+kraken --db $kraken1db --output test-data/sample.out test-data/sample.fastq && \
+kraken-report --db $kraken1db  test-data/sample.out | tee  test-data/sample.report
+
+
+
+## Also, use kraken 2
+## export kraken2db to env variable
+
+### IF you've made flukraken2 in tmp or....
+export KRAKEN2_DEFAULT_DB="tmp/flukraken2"
+
+### IF you have a pre-made minikraken/other kraken db ready 
+export KRAKEN2_DEFAULT_DB="databases/minikraken2_v2_8GB_201904_UPDATE" && \
+kraken2  --output test-data/sample.out  --report test-data/sample.report test-data/sample.fastq
+
+
+### if you need minikraken2
+mkdir -p databases/
+wget ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_v2_8GB_201904.tgz -O databases/minikraken2.tgz
+tar -xvzf databases/minikraken2.tgz --directory databases/ 
+
+## Use Centrifuge
+
+### Install centrifuge
+
+bash install.sh
+
+### Set up centrifuge env
+
+mkdir -p databases/centrifuge
+wget https://genome-idx.s3.amazonaws.com/centrifuge/p_compressed%2Bh%2Bv.tar.gz -O databases/centrifuge.tgz
+tar -xvzf databases/centrifuge.tgz --directory databases/centrifuge/
+
+
+
+export centrifugedb=databases/  # example
+## run classify 
+## If you need to make a new database, see here: $CONDA_PREFIX/lib/centrifuge/centrifuge-build --taxonomy-tree taxdump/nodes.dmp --name-table taxdump/names.dmp  sample.fastq sample
+
+$CONDA_PREFIX/lib/centrifuge/centrifuge -f -x databases/centrifuge/p_compressed+h+v  -q test-data/sample.fastq  --report test-data/sample.centrifuge.report > test-data/sample.out
+$CONDA_PREFIX/lib/centrifuge/centrifuge-kreport  -x databases/centrifuge/p_compressed+h+v test-data/sample.centrifuge.report > test-data/sample.report
+
+
+
+ # Next, generate the hierarchy json file
+python3 src/generate_hierarchy.py \
+-o $PWD/test-data/sample.fullstring \
+--report test-data/sample.report \
+-taxdump taxdump/nodes.dmp
+
+
+#Get the json for mytax sunburst plot 
+
+bash krakenreport2json.sh -i test-data/sample.fullstring -o test-data/sample.json
+
+
+
+```
+
+The resulting file can then be imported into the sunburst plot at `sunburst/index.html`, served with a simple `http.server`.
+
 # License and copyright
 
 Copyright (c) 2019 Thomas Mehoke

diff --git a/environment.yml b/environment.yml
@@ -2,18 +2,13 @@ name: mytax
 channels:
   - conda-forge
   - bioconda
-  - anaconda
   - defaults
+  - anaconda
+  - r
 dependencies:
   - gawk=5.1.0
-  - gettext=0.19.8.1
-  - jellyfish=1.1.12
-  - libcxx=10.0.0
-  - libffi=3.3
-  - libiconv=1.16
+  - kraken2=2.1.2
+  - kmer-jellyfish=1.1.12
   - make=4.3
-  - ncurses=6.2
-  - perl=5.26.2
-  - readline=8.0
-  - kraken=1.1
-  - centrifuge=1.0.3
+  - pandas=1.3.4
+  - anytree=2.8.0
diff --git a/install.sh b/install.sh
@@ -0,0 +1,23 @@
+
+# conda env create -f environment.yml
+
+
+## Make sure that mytax env is activated first!
+# # Install kraken1
+# rm -rf $CONDA_PREFIX/lib/kraken1
+# git clone https://github.com/DerrickWood/kraken.git $CONDA_PREFIX/lib/kraken1
+# cd $CONDA_PREFIX/lib/kraken1
+# echo $PWD
+# bash install_kraken.sh $CONDA_PREFIX/bin
+
+# find envs -name "*.yml" -exec conda create -f {} \;
+
+# If you have a Mac, make sure Xcode is installed and up to date. On Windows, use Cygwin or install inside a Docker container.
+# try
+    git clone https://github.com/infphilo/centrifuge $CONDA_PREFIX/lib/centrifuge && \
+    make -C $CONDA_PREFIX/lib/centrifuge && \
+    make install -C $CONDA_PREFIX/lib/centrifuge prefix=$CONDA_PREFIX || # catch
+    cp $CONDA_PREFIX/lib/centrifuge/centrifuge-* $CONDA_PREFIX/bin/ && chmod +x $CONDA_PREFIX/bin/centrifuge*
+
+
+