diff --git a/.github/workflows/github_actions.config b/.github/workflows/github_actions.config index 2db6ccd..f739ac4 100644 --- a/.github/workflows/github_actions.config +++ b/.github/workflows/github_actions.config @@ -1,57 +1,111 @@ process { - withName:unicycler{ - cpus = 2 - } - withName:flye{ - cpus = 2 - } - withName:dragonflye{ - cpus = 2 - } - withName:masurca{ - cpus = 2 - } - withName:medaka{ - cpus = 2 - } - withName:nanoplot{ - cpus = 2 - } - withName:miniasm{ - cpus = 2 - } - withName:porechop{ - cpus = 2 - } - withName:rasusa{ - cpus = 2 - } - withName:raven{ - cpus = 2 - } - withName:'trycycler.*'{ - cpus = 2 - } - withName:cluster { - cpus = 2 - } - withName:reconcile{ - cpus = 2 - errorStrategy = 'ignore' - } - withName:msa { - cpus = 2 - } - withName:partition { - cpus = 2 - } - withName:consensus { - cpus = 2 - } - withName:combine { - cpus = 2 - } - withName:unicycler { - cpus = 2 - } + maxForks = 1 + + withName:bandage{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:busco{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:bwa{ + cpus=3 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:circulocov{ + cpus=3 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:copy{ + cpus=2 + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:dnaapler{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:fastp{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:flye{ + cpus=3 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:gfastats{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:gfa_to_fasta{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:medaka{ + cpus=3 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:multiqc{ + cpus=3 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:nanoplot_summary{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:nanoplot{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:polypolish{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:pypolca{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:rasusa{ + cpus=2 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:raven{ + cpus=2 + ext.args = ' ' + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 'retry' : 'ignore'} + } + withName:summary{ + cpus=2 + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:unicycler{ + cpus=3 + memory = { 6.GB * task.attempt } + errorStrategy = { task.attempt < 2 ? 
'retry' : 'terminate'} + } + withName:versions{ + cpus=2 + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } + withName:test{ + cpus=2 + errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'} + } } diff --git a/.github/workflows/github_actions_hybrid_sample_sheet.txt b/.github/workflows/github_actions_hybrid_sample_sheet.txt index f0b6bc1..a954d5a 100644 --- a/.github/workflows/github_actions_hybrid_sample_sheet.txt +++ b/.github/workflows/github_actions_hybrid_sample_sheet.txt @@ -1,2 +1,2 @@ sample,fastq,fastq_1,fastq_2 -test,long_reads_low_depth.fastq.gz,short_reads_1.fastq.gz,short_reads_2.fastq.gz \ No newline at end of file +test,reads/long_reads_low_depth.fastq.gz,reads/short_reads_1.fastq.gz,reads/short_reads_2.fastq.gz \ No newline at end of file diff --git a/.github/workflows/run_workflow_dragonflye.yml b/.github/workflows/run_workflow_dragonflye.yml deleted file mode 100644 index fbc230d..0000000 --- a/.github/workflows/run_workflow_dragonflye.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Test Donut Falls dragonflye assembly - -on: [pull_request, workflow_dispatch] - -jobs: - - test: - runs-on: ubuntu-20.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run Donut Falls - run: | - docker --version - - wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR377/009/ERR3772599/ERR3772599_1.fastq.gz - - mv ERR3772599_1.fastq.gz reads/. - - nextflow run . -profile docker -c .github/workflows/github_actions.config --reads reads --assembler dragonflye - - tree donut_falls \ No newline at end of file diff --git a/.github/workflows/run_workflow_miniasm.yml b/.github/workflows/run_workflow_miniasm.yml deleted file mode 100644 index 363ba72..0000000 --- a/.github/workflows/run_workflow_miniasm.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Test Donut Falls miniasm and minipolish assembly - -on: [pull_request, workflow_dispatch] - -jobs: - - test: - runs-on: ubuntu-20.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run Donut Falls - run: | - docker --version - - nextflow run . -profile docker,test3 -c .github/workflows/github_actions.config - - tree donut_falls \ No newline at end of file diff --git a/.github/workflows/run_workflow_porechop.yml b/.github/workflows/run_workflow_porechop.yml deleted file mode 100644 index 71bee54..0000000 --- a/.github/workflows/run_workflow_porechop.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Test Donut Falls flye assembly with porechop - -on: [pull_request, workflow_dispatch] - -jobs: - - test: - runs-on: ubuntu-20.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run Donut Falls - run: | - docker --version - - nextflow run . 
-profile docker,test1 -c .github/workflows/github_actions.config - - tree donut_falls \ No newline at end of file diff --git a/.github/workflows/run_workflow_raven_dir.yml b/.github/workflows/run_workflow_raven_dir.yml deleted file mode 100644 index b26e4ad..0000000 --- a/.github/workflows/run_workflow_raven_dir.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Test Donut Falls raven assembly from directory - -on: [pull_request, workflow_dispatch] - -jobs: - - test: - runs-on: ubuntu-20.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run Donut Falls - run: | - docker --version - - mkdir reads - - wget -q https://github.com/nf-core/test-datasets/blob/23f5b889e4736798c8692e9b92810d9a3e37ee97/nanopore/subset15000.fq.gz?raw=true -O reads/nfcore_subset15000.fa.gz - - wget -q https://bridges.monash.edu/ndownloader/files/23754659 -O great_dataset.tar.gz - tar -xvf great_dataset.tar.gz - mv reads.fastq.gz reads/. - - nextflow run . -profile docker -c .github/workflows/github_actions.config --reads reads --assembler raven - - tree donut_falls \ No newline at end of file diff --git a/.github/workflows/run_workflow_raven_polish.yml b/.github/workflows/run_workflow_raven_polish.yml deleted file mode 100644 index 59f8917..0000000 --- a/.github/workflows/run_workflow_raven_polish.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Test Donut Falls raven assembly with polishing - -on: [pull_request, workflow_dispatch] - -jobs: - - test: - runs-on: ubuntu-20.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run Donut Falls - run: | - docker --version - - # This needs a better dataset. Sorry! - - # nextflow run . -profile docker -c .github/workflows/github_actions.config --sample_sheet .github/workflows/github_actions_hd_hybrid_sample_sheet.txt --assembler raven - - # tree donut_falls \ No newline at end of file diff --git a/.github/workflows/run_workflow_trycycler.yml b/.github/workflows/run_workflow_trycycler.yml deleted file mode 100644 index 6ed515f..0000000 --- a/.github/workflows/run_workflow_trycycler.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Test Donut Falls with trycycler - -on: [pull_request, workflow_dispatch] - -jobs: - - test: - runs-on: ubuntu-20.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run Donut Falls - run: | - docker --version - - nextflow run . 
-profile docker,test5 -c .github/workflows/github_actions.config - tree donut_falls \ No newline at end of file diff --git a/.github/workflows/run_workflow_unicyclerlr.yml b/.github/workflows/run_workflow_unicyclerlr.yml deleted file mode 100644 index 8c1c8ac..0000000 --- a/.github/workflows/run_workflow_unicyclerlr.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Test Donut Falls unicycler long read only assembly - -on: [pull_request, workflow_dispatch] - -jobs: - - test: - runs-on: ubuntu-20.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run Donut Falls - run: | - docker --version - - nextflow run . -profile docker,test4 -c .github/workflows/github_actions.config - tree donut_falls - \ No newline at end of file diff --git a/.github/workflows/sample_sheet_all.csv b/.github/workflows/sample_sheet_all.csv new file mode 100755 index 0000000..e4c745e --- /dev/null +++ b/.github/workflows/sample_sheet_all.csv @@ -0,0 +1,3 @@ +sample,fastq,fastq_1,fastq_2 +test,test_files/test_nanopore.fastq.gz,test_files/test_illumina_1.fastq.gz,test_files/test_illumina_1.fastq.gz +test2,test_files/test_nanopore_only.fastq.gz,, \ No newline at end of file diff --git a/.github/workflows/sample_sheet_both.csv b/.github/workflows/sample_sheet_both.csv new file mode 100755 index 0000000..00c93ed --- /dev/null +++ b/.github/workflows/sample_sheet_both.csv @@ -0,0 +1,2 @@ +sample,fastq,fastq_1,fastq_2 +test,test_files/test_nanopore.fastq.gz,test_files/test_illumina_1.fastq.gz,test_files/test_illumina_1.fastq.gz \ No newline at end of file diff --git a/.github/workflows/sample_sheet_lr.csv b/.github/workflows/sample_sheet_lr.csv new file mode 100755 index 0000000..331311c --- /dev/null +++ b/.github/workflows/sample_sheet_lr.csv @@ -0,0 +1,3 @@ +sample,fastq +test,test_files/test_nanopore.fastq.gz +test2,test_files/test_nanopore_only.fastq.gz \ No newline at end of file diff --git a/.github/workflows/ss_lr.csv b/.github/workflows/ss_lr.csv new file mode 100755 index 0000000..f966789 --- /dev/null +++ b/.github/workflows/ss_lr.csv @@ -0,0 +1,2 @@ +sample,fastq +test,reads/long_reads_low_depth.fastq.gz \ No newline at end of file diff --git a/.github/workflows/run_workflow_flye.yml b/.github/workflows/test_assembler_flye.yml old mode 100644 new mode 100755 similarity index 64% rename from .github/workflows/run_workflow_flye.yml rename to .github/workflows/test_assembler_flye.yml index 9f835b5..8fd7f33 --- a/.github/workflows/run_workflow_flye.yml +++ b/.github/workflows/test_assembler_flye.yml @@ -8,12 +8,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout + uses: actions/checkout@v4 - name: Install Nextflow run: | @@ -26,4 +21,10 @@ jobs: nextflow run . 
-profile docker,test -c .github/workflows/github_actions.config - tree donut_falls \ No newline at end of file + tree donut_falls + + - name: Check files + run: | + ls donut_falls/multiqc/multiqc_report.html + ls donut_falls/summary/flye_summary.tsv + cat donut_falls/summary/donut_falls_summary.json \ No newline at end of file diff --git a/.github/workflows/run_workflow_masurca.yml b/.github/workflows/test_assembler_none.yml old mode 100644 new mode 100755 similarity index 69% rename from .github/workflows/run_workflow_masurca.yml rename to .github/workflows/test_assembler_none.yml index 46accf4..edada46 --- a/.github/workflows/run_workflow_masurca.yml +++ b/.github/workflows/test_assembler_none.yml @@ -1,4 +1,4 @@ -name: Test Donut Falls masurca hybrid assembly +name: Test Donut Falls no assemblers on: [pull_request, workflow_dispatch] @@ -8,27 +8,33 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout + uses: actions/checkout@v4 - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - name: Run Donut Falls + - name: Download Unicycler test files run: | - docker --version - + mkdir reads + # from unicycler test data wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/short_reads_1.fastq.gz wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/short_reads_2.fastq.gz #wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/long_reads_high_depth.fastq.gz wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/long_reads_low_depth.fastq.gz - nextflow run . -profile docker -c .github/workflows/github_actions.config --sample_sheet .github/workflows/github_actions_hybrid_sample_sheet.txt --assembler masurca + mv *fastq.gz reads/. - tree donut_falls \ No newline at end of file + - name: Run Donut Falls + run: | + docker --version + + nextflow run . -profile docker -c .github/workflows/github_actions.config --sample_sheet .github/workflows/github_actions_hybrid_sample_sheet.txt --assembler doesnt_exist + + tree donut_falls + + - name: Check files + run: | + ls donut_falls/multiqc/multiqc_report.html + cat donut_falls/summary/donut_falls_summary.json \ No newline at end of file diff --git a/.github/workflows/run_workflow_raven.yml b/.github/workflows/test_assembler_raven.yml old mode 100644 new mode 100755 similarity index 58% rename from .github/workflows/run_workflow_raven.yml rename to .github/workflows/test_assembler_raven.yml index afb21e1..f17c362 --- a/.github/workflows/run_workflow_raven.yml +++ b/.github/workflows/test_assembler_raven.yml @@ -8,12 +8,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout + uses: actions/checkout@v4 - name: Install Nextflow run: | @@ -24,6 +19,11 @@ jobs: run: | docker --version - nextflow run . -profile docker,test2 -c .github/workflows/github_actions.config + nextflow run . 
-profile docker,test -c .github/workflows/github_actions.config --assembler raven + tree donut_falls - \ No newline at end of file + + - name: Check files + run: | + ls donut_falls/multiqc/multiqc_report.html + cat donut_falls/summary/donut_falls_summary.json \ No newline at end of file diff --git a/.github/workflows/run_workflow_unicycler.yml b/.github/workflows/test_assembler_unicycler.yml old mode 100644 new mode 100755 similarity index 76% rename from .github/workflows/run_workflow_unicycler.yml rename to .github/workflows/test_assembler_unicycler.yml index bef867a..310a6d0 --- a/.github/workflows/run_workflow_unicycler.yml +++ b/.github/workflows/test_assembler_unicycler.yml @@ -1,4 +1,4 @@ -name: Test Donut Falls unicycler hybrid assembly +name: Test Donut Falls unicycler assembly on: [pull_request, workflow_dispatch] @@ -8,22 +8,15 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v3 - with: - lfs: true - - - name: Checkout LFS objects - run: git lfs checkout + uses: actions/checkout@v4 - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - name: Run Donut Falls + - name: Download Unicycler test files run: | - docker --version - mkdir reads # from unicycler test data wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/short_reads_1.fastq.gz @@ -31,6 +24,15 @@ jobs: #wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/long_reads_high_depth.fastq.gz wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/long_reads_low_depth.fastq.gz + mv *fastq.gz reads/. + + - name: Run Donut Falls + run: | nextflow run . -profile docker -c .github/workflows/github_actions.config --sample_sheet .github/workflows/github_actions_hybrid_sample_sheet.txt --assembler unicycler + + tree donut_falls - tree donut_falls \ No newline at end of file + - name: Check files + run: | + ls donut_falls/multiqc/multiqc_report.html + cat donut_falls/summary/donut_falls_summary.json \ No newline at end of file diff --git a/.github/workflows/test_sample_sheet_all.yml b/.github/workflows/test_sample_sheet_all.yml new file mode 100755 index 0000000..91816c3 --- /dev/null +++ b/.github/workflows/test_sample_sheet_all.yml @@ -0,0 +1,36 @@ +name: Test Donut Falls sample sheet all + +on: [pull_request, workflow_dispatch] + +jobs: + + test: + runs-on: ubuntu-20.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Get files + run: | + wget --quiet https://zenodo.org/records/10779911/files/df_test_files.tar.gz?download=1 -O dataset.tar.gz + tar -xvf dataset.tar.gz + + cp test_files/test_nanopore.fastq.gz test_files/test_nanopore_only.fastq.gz + + - name: Run Donut Falls + run: | + docker --version + + nextflow run . 
-profile docker -c .github/workflows/github_actions.config --sample_sheet .github/workflows/sample_sheet_all.csv + + tree donut_falls + + - name: Check files + run: | + ls donut_falls/multiqc/multiqc_report.html + cat donut_falls/summary/donut_falls_summary.json \ No newline at end of file diff --git a/.github/workflows/test_sample_sheet_both_only.yml b/.github/workflows/test_sample_sheet_both_only.yml new file mode 100755 index 0000000..60c04c7 --- /dev/null +++ b/.github/workflows/test_sample_sheet_both_only.yml @@ -0,0 +1,36 @@ +name: Test Donut Falls sample sheet both + +on: [pull_request, workflow_dispatch] + +jobs: + + test: + runs-on: ubuntu-20.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Get files + run: | + wget --quiet https://zenodo.org/records/10779911/files/df_test_files.tar.gz?download=1 -O dataset.tar.gz + tar -xvf dataset.tar.gz + + cp test_files/test_nanopore.fastq.gz test_files/test_nanopore_only.fastq.gz + + - name: Run Donut Falls + run: | + docker --version + + nextflow run . -profile docker -c .github/workflows/github_actions.config --sample_sheet .github/workflows/sample_sheet_both.csv + + tree donut_falls + + - name: Check files + run: | + ls donut_falls/multiqc/multiqc_report.html + cat donut_falls/summary/donut_falls_summary.json \ No newline at end of file diff --git a/.github/workflows/test_sample_sheet_lr_only.yml b/.github/workflows/test_sample_sheet_lr_only.yml new file mode 100755 index 0000000..d2face7 --- /dev/null +++ b/.github/workflows/test_sample_sheet_lr_only.yml @@ -0,0 +1,36 @@ +name: Test Donut Falls sample sheet lr + +on: [pull_request, workflow_dispatch] + +jobs: + + test: + runs-on: ubuntu-20.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Get files + run: | + wget --quiet https://zenodo.org/records/10779911/files/df_test_files.tar.gz?download=1 -O dataset.tar.gz + tar -xvf dataset.tar.gz + + cp test_files/test_nanopore.fastq.gz test_files/test_nanopore_only.fastq.gz + + - name: Run Donut Falls + run: | + docker --version + + nextflow run . 
-profile docker -c .github/workflows/github_actions.config --sample_sheet .github/workflows/sample_sheet_lr.csv + + tree donut_falls + + - name: Check files + run: | + ls donut_falls/multiqc/multiqc_report.html + cat donut_falls/summary/donut_falls_summary.json \ No newline at end of file diff --git a/.github/workflows/test_ss_lr.yml b/.github/workflows/test_ss_lr.yml new file mode 100755 index 0000000..8195a4b --- /dev/null +++ b/.github/workflows/test_ss_lr.yml @@ -0,0 +1,40 @@ +name: Test Donut Falls sample sheet lr with unicycler assembly + +on: [pull_request, workflow_dispatch] + +jobs: + + test: + runs-on: ubuntu-20.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Download Unicycler test files + run: | + mkdir reads + # from unicycler test data + wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/short_reads_1.fastq.gz + wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/short_reads_2.fastq.gz + #wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/long_reads_high_depth.fastq.gz + wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/long_reads_low_depth.fastq.gz + + mv *fastq.gz reads/. + + - name: Run Donut Falls + run: | + docker --version + + nextflow run . -profile docker -c .github/workflows/github_actions.config --sample_sheet .github/workflows/ss_lr.csv --assembler unicycler,flye + + tree donut_falls + + - name: Check files + run: | + ls donut_falls/multiqc/multiqc_report.html + cat donut_falls/summary/donut_falls_summary.json \ No newline at end of file diff --git a/.github/workflows/versions.yml b/.github/workflows/versions.yml index 1df454b..e912f16 100644 --- a/.github/workflows/versions.yml +++ b/.github/workflows/versions.yml @@ -20,9 +20,10 @@ jobs: - name: versions run: | issue_text="Needing updates: " - staphb_containers=($(grep -h container modules/* | grep staphb | sort | uniq | grep -v latest | awk '{print $2}' | sed 's/'\''//g')) + staphb_containers=($(grep -h container donut_falls.nf | grep staphb | sort | uniq | grep -v latest | awk '{print $2}' | sed 's/'\''//g')) for container in ${staphb_containers[@]} do + echo "Looking at $container" base=$(echo $container | cut -f 1 -d ":") repo_version=$(echo $container | cut -f 2 -d ":") @@ -36,6 +37,7 @@ jobs: echo "New version for $base! Upgrade to $latest_version from $repo_version." | tee -a versions.txt issue_text="$issue_text $base from $repo_version to $latest_version " fi + docker rmi $base:latest done echo $issue_text cat versions.txt diff --git a/README.md b/README.md index 456e0b5..f48ab32 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,6 @@ Donut Falls is a [Nextflow](https://www.nextflow.io/) workflow developed by [@er Donut Falls is also included in the staphb toolkit [staphb-toolkit](https://github.com/StaPH-B/staphb_toolkit). -Donut Falls, admittedly, is just a temporary stop-gap until nf-core's [genomeassembler](https://github.com/nf-core/genomeassembler) is released, so do not be suprised if this repository gets archived. - We made a [wiki](https://github.com/UPHL-BioNGS/Donut_Falls/wiki), please read it! 
## Wiki table of contents: @@ -27,17 +25,16 @@ We made a [wiki](https://github.com/UPHL-BioNGS/Donut_Falls/wiki), please read i - [Using config files](https://github.com/UPHL-BioNGS/Donut_Falls/wiki/Usage#using-a-config-file) - [Parameters worth adjusting](https://github.com/UPHL-BioNGS/Donut_Falls/wiki/Usage#recommended-parameters-to-adjust) - [Examples](https://github.com/UPHL-BioNGS/Donut_Falls/wiki/Usage#examples) -- [Subworkflows](https://github.com/UPHL-BioNGS/Donut_Falls/wiki/Subworkflows) + - [Using as a subworkflow](https://github.com/UPHL-BioNGS/Donut_Falls/wiki/Linking) +- [Workflow DAG](https://github.com/UPHL-BioNGS/Donut_Falls/wiki#basic-diagram-of-the-workflow-and-subworkflows) - [FAQ](https://github.com/UPHL-BioNGS/Donut_Falls/wiki/FAQ) - ## Getting started ### Install dependencies - [Nextflow](https://www.nextflow.io/docs/latest/getstarted.html) - [Singularity](https://singularity.lbl.gov/install-linux) or [Docker](https://docs.docker.com/get-docker/) - ## Quick start ``` @@ -63,24 +60,13 @@ sample2,sample2.fastq.gz,, ### Switching assemblers There are currently several options for assembly -- [flye](https://github.com/fenderglass/Flye) -- [dragonflye](https://github.com/rpetit3/dragonflye) -- [miniasm](https://github.com/rrwick/Minipolish) +- [flye](https://github.com/fenderglass/Flye) (default) - [raven](https://github.com/lbcb-sci/raven) - [unicycler](https://github.com/rrwick/Unicycler) (requires short reads for hybrid assembly) -- [lr_unicycler](https://github.com/rrwick/Unicycler) (uses unicycler's nanopore-only option) -- [trycycler](https://github.com/rrwick/Trycycler) - please read [the wiki page](https://github.com/UPHL-BioNGS/Donut_Falls/wiki/Trycycler) -These are specified with the `assembler` paramater. +These are specified with the `assembler` parameter. If Illumina reads are found, then flye and raven assemblies will be polished with those reads. -``` -# nanopore assembly followed by polishing if illumina files are supplied -# assembler is flye, miniasm, raven, or lr_unicycler -nextflow run UPHL-BioNGS/Donut_Falls -profile singularity --sample_sheet --assembler < assembler > - -# hybrid assembly with unicycler where both nanopore and illumina files are required -nextflow run UPHL-BioNGS/Donut_Falls -profile singularity --sample_sheet --assembler unicycler -``` +Note: more than one assembler can be chosen (e.g. `params.assembler = 'flye,raven'`). This will run the input files on each assembler listed. Listing an assembler more than once will not create additional assemblies with that tool (e.g. `params.assembler = 'flye,flye,flye'` will still only run the input files through flye once). ### Reading the sequencing summary file Although not used for anything else, the sequencing summary file can be read in and put through nanoplot to visualize the quality of a sequencing run. This is an optional file and can be set with 'params.sequencing_summary'. @@ -90,22 +76,46 @@ nextflow run UPHL-BioNGS/Donut_Falls -profile singularity --sequencing_summary < * WARNING : Does not work with _older_ versions of the summary file. 
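+For illustration, a run that only visualizes a sequencing summary might look like the following sketch (the filename `sequencing_summary_FAL12345.txt` is a hypothetical placeholder for the summary file produced by your ONT run): +``` +# hypothetical filename; substitute the summary file from your own run +nextflow run UPHL-BioNGS/Donut_Falls -profile singularity --sequencing_summary sequencing_summary_FAL12345.txt +``` +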
+## Examples +``` +# nanopore assembly with flye followed by polishing if illumina files are supplied +nextflow run UPHL-BioNGS/Donut_Falls -profile singularity --sample_sheet sample_sheet.csv + +# or specifying the assembler explicitly +nextflow run UPHL-BioNGS/Donut_Falls -profile singularity --sample_sheet sample_sheet.csv --assembler flye + +# hybrid assembly with unicycler where both nanopore and illumina files are required +nextflow run UPHL-BioNGS/Donut_Falls -profile singularity --sample_sheet sample_sheet.csv --assembler unicycler + +# assembling with all three assemblers +# specifying the results to be stored in 'donut_falls_test_results' instead of 'donut_falls' +# using docker instead of singularity +nextflow run UPHL-BioNGS/Donut_Falls -profile docker --sample_sheet sample_sheet.csv --assembler unicycler,flye,raven --outdir donut_falls_test_results + + +# using some test files (requires internet connection) +nextflow run UPHL-BioNGS/Donut_Falls -profile docker --sample_sheet sample_sheet.csv --test + +# same as above +nextflow run UPHL-BioNGS/Donut_Falls -profile docker,test --sample_sheet sample_sheet.csv +``` + ## Credits Donut Falls would not be possible without -- [bgzip](https://github.com/samtools/htslib) : file compression after filtlong +- [bandage](https://github.com/rrwick/Bandage) : visualize gfa files - [busco](https://gitlab.com/ezlab/busco) : assessment of assembly quality -- [circlator](https://github.com/sanger-pathogens/circlator) : rotating circular assembled chromosomes and plasmids -- [dragonflye](https://github.com/rpetit3/dragonflye) : de novo assembly (params.assembler = 'dragonflye') -- [fastp](https://github.com/OpenGene/fastp) : cleaning illuming reads -- [filtlong](https://github.com/rrwick/Filtlong) : prioritizing high quality reads for assembly +- [bwa](https://github.com/lh3/bwa) : aligning reads for polypolish +- [circulocov](https://github.com/erinyoung/CirculoCov) : read depth per contig +- [dnaapler](https://github.com/gbouras13/dnaapler) : reorienting contigs to a consistent starting gene +- [fastp](https://github.com/OpenGene/fastp) : cleaning illumina reads (default values) and nanopore reads (minimum length = 1,000 & minimum Q = 12) - [flye](https://github.com/fenderglass/Flye) : de novo assembly (default assembler) - [gfastats](https://github.com/vgl-hub/gfastats) : assessment of assembly -- [masurca](https://github.com/alekseyzimin/masurca) : hybrid assembly options (params.assembler = 'masurca') - [medaka](https://github.com/nanoporetech/medaka) : polishing with nanopore reads -- [miniasm and minipolish](https://github.com/rrwick/Minipolish) : de novo assembly option (params.assembler = 'miniasm') +- [multiqc](https://multiqc.info/) : amalgamation of results - [nanoplot](https://github.com/wdecoster/NanoPlot) : fastq file QC visualization -- [polca](https://github.com/alekseyzimin/masurca) : polishing assemblies with Illumina reads +- [polypolish](https://github.com/rrwick/Polypolish) : reduces sequencing artefacts through polishing with Illumina reads +- [pypolca](https://github.com/gbouras13/pypolca) : reduces sequencing artefacts through polishing with Illumina reads +- [rasusa](https://github.com/mbhall88/rasusa) : subsampling nanopore reads to 150X depth - [raven](https://github.com/lbcb-sci/raven) : de novo assembly option (params.assembler = 'raven') -- [trycycler](https://github.com/rrwick/Trycycler) : reconciles different assemblies (params.assembler = 'trycycler') -- [unicycler](https://github.com/rrwick/Unicycler) : hybrid assembly option (params.assembler = 'unicycler') or de novo assembly 
option (params.assembler = 'lr_unicycler') +- [unicycler](https://github.com/rrwick/Unicycler) : hybrid assembly option (params.assembler = 'unicycler') diff --git a/bin/copy_fasta.py b/bin/copy_fasta.py new file mode 100755 index 0000000..17b5c16 --- /dev/null +++ b/bin/copy_fasta.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import glob +import json +import csv +import os + +def gfastats_to_dict(header_dict): + dict = {} + with open("gfastats_summary.csv", mode='r', newline='') as file: + reader = csv.DictReader(file) + for row in reader: + if row["sample"] == header_dict['name'] + "_" + header_dict['assembler']: + key = row["Header"] + + dict[key] = row + return dict + +def circulocov_to_dict(header_dict): + dict = {} + with open("circulocov_summary.txt", mode='r', newline='') as file: + reader = csv.DictReader(file, delimiter="\t") + for row in reader: + + if row["sample"].replace("_reoriented","") == header_dict['name'] + "_" + header_dict['assembler'] : + key = row["contigs"] + + dict[key] = row + return dict + +def copy_fasta(fasta, header_dict, gfa_dict, circulocov_dict): + with open(fasta, 'r') as file: + with open(f"consensus/{header_dict['fasta']}", 'w') as outfile: + for line in file: + line = line.strip() + if line.startswith('>'): + contig = line.replace(">","").split()[0] + circular = gfa_dict[contig]['circular'].replace("Y","true").replace("N","false") + length = gfa_dict[contig]['Total segment length'] + gc_per = gfa_dict[contig]['GC content %'] + meandepth = circulocov_dict[contig]['nanopore_meandepth'] + assembler = header_dict['assembler'] + step = header_dict['step'] + outfile.write(f">{contig} circ={circular} len={length} gc={gc_per} cov={meandepth} asmb={assembler} stp={step}\n") + else: + outfile.write(f"{line}\n") + +def main(): + + os.mkdir("consensus") + + header_dict = {} + fasta = glob.glob("*.fasta")[0] + header_dict['fasta'] = fasta + + name = fasta.replace(".fasta", "") + + assemblers = ['dragonflye', 'flye', 'hybracter', 'raven', 'unicycler'] + steps = ["reoriented", 'polypolish', 'pypolca', 'medaka'] + for step in steps: + if step in name: + header_dict['step'] = step + name = name.replace(f"_{step}","") + break + + if 'step' not in header_dict.keys(): + header_dict['step'] = False + + for assembler in assemblers: + if assembler in name: + header_dict['assembler'] = assembler + name = name.replace(f"_{assembler}","") + break + + header_dict['name'] = name + + gfa_dict = gfastats_to_dict(header_dict) + circulocov_dict = circulocov_to_dict(header_dict) + + copy_fasta(fasta, header_dict, gfa_dict, circulocov_dict) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bin/gfa_to_fasta.py b/bin/gfa_to_fasta.py new file mode 100755 index 0000000..791dff6 --- /dev/null +++ b/bin/gfa_to_fasta.py @@ -0,0 +1,34 @@ +import csv +import glob + +def convert_to_fasta(summary_dict, gfa_file): + outfile = '_'.join(gfa_file.split('.')[:-1]) + ".fasta" + with open(gfa_file, mode='r') as file: + for line in file: + parts = line.split() + if parts and parts[0] == "S": + header = parts[1] + seq = parts[2] + if header in summary_dict.keys(): + new_header = ">" + header + " length=" + summary_dict[header]['Total segment length'] + " circular=" + summary_dict[header]["circular"].replace("N","false").replace("Y","true") + " gc_per=" + summary_dict[header]["GC content %"] + "\n" + with open(outfile, mode='a') as output_file: + output_file.write(new_header) + output_file.write(seq + "\n") + +def read_summary_csv(gfastats_file): + summary_dict = {} + with 
open(gfastats_file, mode='r', newline='') as file: + reader = csv.DictReader(file) + for row in reader: + key = row['Header'] + summary_dict[key] = row + with open("noncircular.txt", mode='a') as output_file: + if summary_dict[key]["circular"] == "N": + output_file.write(key + "\n") + return summary_dict + +gfastats_file = glob.glob("*_gfastats_summary.csv") +gfa_file = glob.glob("*.gfa") + +summary_dict = read_summary_csv(gfastats_file[0]) +convert_to_fasta(summary_dict, gfa_file[0]) diff --git a/bin/organize_summary.py b/bin/organize_summary.py new file mode 100755 index 0000000..a641942 --- /dev/null +++ b/bin/organize_summary.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 + +import glob +import json +import csv +from os.path import exists + +def file_to_dict(file, header, delim): + dict = {} + with open(file, mode='r', newline='') as file: + reader = csv.DictReader(file, delimiter=delim) + for row in reader: + key = row[header] + dict[key] = row + return dict + +def file_to_dict_uniq(file, header, header2, delim): + dict = {} + with open(file, mode='r', newline='') as file: + reader = csv.DictReader(file, delimiter=delim) + for row in reader: + if row[header] not in dict.keys(): + dict[row[header]] = {} + + key = row[header] + "_" + row[header2] + dict[row[header]][key] = row + return dict + + +def final_file(dict, assemblers): + with open('donut_falls_summary.json', 'w') as json_file: + json.dump(dict, json_file, indent=4) + +def main(): + + if exists('nanoplot_summary.csv') : + nanoplot_dict = file_to_dict('nanoplot_summary.csv', 'sample', ',') + + if exists('pypolca_summary.tsv') : + pypolca_dict = file_to_dict('pypolca_summary.tsv', 'sample', '\t') + + if exists('gfastats_summary.csv') : + gfastats_dict = file_to_dict_uniq('gfastats_summary.csv', 'sample', 'Header', ',') + + busco_dict = {} + busco_files = glob.glob("short_summary*txt") + for file in busco_files: + sample_analysis = file.split(".")[-2] + with open(file, 'r') as f: + for line in f: + # keep the one-line summary that contains all of the C/S/D/F/M/n fields + if all(s in line for s in ("C:", "S:", "D:", "F:", "M:", "n:")): + busco_dict[sample_analysis] = line.strip() + break + + circulocov_dict = {} + circulocov_files = glob.glob("*overall_summary.txt") + for file in circulocov_files: + sample_analysis = file.replace("_overall_summary.txt", "").replace("_reoriented", "") + circulocov_dict[sample_analysis] = {} + with open(file, 'r') as f: + for line in f: + parts = line.split() + if parts[2] == "all": + circulocov_dict[sample_analysis]["coverage"] = parts[7] + + if "missing" in line: + if len(parts) > 8: + unmapped_illumina = parts[8] + else: + unmapped_illumina = 0 + + circulocov_dict[sample_analysis]["unmapped_nanopore"] = parts[4] + circulocov_dict[sample_analysis]["unmapped_illumina"] = unmapped_illumina + + final_results = {} + assemblers = ['dragonflye', 'flye', 'hybracter', 'raven', 'unicycler'] + for key in nanoplot_dict.keys(): + final_results[key] = {} + final_results[key]['name'] = key + + # from nanoplot + final_results[key]['number_of_reads'] = nanoplot_dict[key]['number_of_reads'] + final_results[key]['mean_read_length'] = nanoplot_dict[key]['mean_read_length'] + final_results[key]['mean_qual'] = nanoplot_dict[key]['mean_qual'] + for assembler in assemblers: + if key + "_" + assembler in gfastats_dict.keys(): + final_results[key][assembler] = {} + + # gfastats results + total_length = 0 + num_circular = 0 + for contig in gfastats_dict[key + "_" + assembler].keys(): + total_length = total_length + int(gfastats_dict[key + "_" + assembler][contig]["Total segment length"]) + if 
gfastats_dict[key + "_" + assembler][contig]["circular"] == "Y": + num_circular = num_circular + 1 + final_results[key][assembler]['total_length'] = total_length + final_results[key][assembler]['num_contigs'] = len(gfastats_dict[key + "_" + assembler].keys()) + final_results[key][assembler]['circ_contigs'] = num_circular + + # circulocov results + if key + "_" + assembler in circulocov_dict.keys(): + final_results[key][assembler]['coverage'] = circulocov_dict[key + '_' + assembler]['coverage'] + final_results[key][assembler]['unmapped_nanopore'] = circulocov_dict[key + '_' + assembler]['unmapped_nanopore'] + final_results[key][assembler]['unmapped_illumina'] = circulocov_dict[key + '_' + assembler]['unmapped_illumina'] + + # busco results + if key + "_" + assembler in busco_dict.keys(): + final_results[key][assembler]['busco'] = busco_dict[key + "_" + assembler] + if key + "_" + assembler + '_reoriented' in busco_dict.keys(): + final_results[key][assembler]['busco'] = busco_dict[key + "_" + assembler + '_reoriented'] + for step in ['polypolish', 'pypolca', 'medaka']: + if key + "_" + assembler + '_' + step in busco_dict.keys(): + final_results[key][assembler]['busco_' + step ] = busco_dict[key + "_" + assembler + '_' + step] + else: + final_results[key][assembler]['busco_' + step ] = 'NF' + + # pypolca results + if key + "_" + assembler in pypolca_dict.keys(): + final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = pypolca_dict[key + "_" + assembler]['Consensus_Quality_Before_Polishing'] + final_results[key][assembler]['Consensus_QV_Before_Polishing'] = pypolca_dict[key + "_" + assembler]['Consensus_QV_Before_Polishing'] + else: + final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = 0 + final_results[key][assembler]['Consensus_QV_Before_Polishing'] = 0 + + final_file(final_results, assemblers) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/configs/10M.config b/configs/10M.config new file mode 100755 index 0000000..a926dac --- /dev/null +++ b/configs/10M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 10mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/1_5M.config b/configs/1_5M.config new file mode 100755 index 0000000..0e34248 --- /dev/null +++ b/configs/1_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 1.5mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/2M.config b/configs/2M.config new file mode 100755 index 0000000..5ecbb22 --- /dev/null +++ b/configs/2M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 2mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/2_5M.config b/configs/2_5M.config new file mode 100755 index 0000000..f16ce5a --- /dev/null +++ b/configs/2_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 2.5mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/3M.config b/configs/3M.config new file mode 100755 index 0000000..43b45c8 --- /dev/null +++ b/configs/3M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 3mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/3_5M.config b/configs/3_5M.config new file mode 100755 index 0000000..a9b5d1a --- /dev/null +++ b/configs/3_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 3.5mb --coverage 150" + } +} \ No newline at end of file diff --git 
a/configs/4M.config b/configs/4M.config new file mode 100755 index 0000000..04134d0 --- /dev/null +++ b/configs/4M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 4mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/4_5M.config b/configs/4_5M.config new file mode 100755 index 0000000..b1edf3d --- /dev/null +++ b/configs/4_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 4.5mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/5_5M.config b/configs/5_5M.config new file mode 100755 index 0000000..2c778f0 --- /dev/null +++ b/configs/5_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 5.5mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/6M.config b/configs/6M.config new file mode 100755 index 0000000..c01c978 --- /dev/null +++ b/configs/6M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 6mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/6_5M.config b/configs/6_5M.config new file mode 100755 index 0000000..ae9de57 --- /dev/null +++ b/configs/6_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 6.5mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/7M.config b/configs/7M.config new file mode 100755 index 0000000..e74c215 --- /dev/null +++ b/configs/7M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 7mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/7_5M.config b/configs/7_5M.config new file mode 100755 index 0000000..58dc14b --- /dev/null +++ b/configs/7_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 7.5mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/8M.config b/configs/8M.config new file mode 100755 index 0000000..03d81ca --- /dev/null +++ b/configs/8M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 8mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/8_5M.config b/configs/8_5M.config new file mode 100755 index 0000000..e164e6f --- /dev/null +++ b/configs/8_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 8.5mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/9M.config b/configs/9M.config new file mode 100755 index 0000000..f815fdf --- /dev/null +++ b/configs/9M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 9mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/9_5M.config b/configs/9_5M.config new file mode 100755 index 0000000..4fb6395 --- /dev/null +++ b/configs/9_5M.config @@ -0,0 +1,5 @@ +process { + withName: rasusa { + ext.args = "--genome-size 9.5mb --coverage 150" + } +} \ No newline at end of file diff --git a/configs/donut_falls_template.config b/configs/donut_falls_template.config index 3654ab3..39c1172 100644 --- a/configs/donut_falls_template.config +++ b/configs/donut_falls_template.config @@ -37,43 +37,211 @@ //tower.enabled = true //# Adjustable Workflow paramters --------------------------- -//# specifies assembler to use. 
Options are 'flye', 'miniasm', 'raven', 'unicycler', 'masurca', and 'lr_unicycler' -//params.assembler = 'flye' +//# specifies input files +//# header = sample,fastq,fastq_1,fastq_2 +//# sample = value for filenames +//# fastq = nanopore fastq file +//# fastq_1 = optional: illumina R1 fastq file +//# fastq_2 = optional: illumina R2 fastq file +//params.sample_sheet = '' + +//# specifies assembler to use. Options are 'flye', 'raven', and 'unicycler' in any combination +//# examples: +//params.assembler = 'flye' +//params.assembler = 'unicycler' +//params.assembler = 'flye,raven' + //# when set to true, creates a copy of this template file for the end user -//params.config_file = false +//params.config_file = false + //# directory where results are placed -//params.outdir = 'donut_falls' -//# directory of ONT reads -//params.reads = '' -//# file for trycycler reconcile for removing sequences from a cluster -//params.remove = 'remove.txt' -//# sample sheet for Donut Falls -//params.sample_sheet = '' -//# sequencing summary file from ONT run -//params.sequencing_summary = workflow.launchDir + "/*sequencing_summary*txt" +//params.outdir = 'donut_falls' + +//# adds test data to workflow +//params.test = false + +//# runs nanoplot on nanopore sequencing summary +//params.sequencing_summary = "" -//# Adjust if additional options in the processes are needed for end user purposes -//params.busco_options = '' -//params.circlator_options = '' -//params.fastp_options = '' -//params.filtlong_options = '--min_length 1000 --keep_percent 95' -//params.flye_options = '' -//params.gfastats_options = '' -//params.masurca_options = '' -//params.medaka_options = '' -//params.multiqc_options = '' -//params.nanoplot_options = '' -//params.nanoplot_summary_options = '' -//params.polca_options = '' -//params.porechop_options = '' -//params.quast_options = '' -//params.rasusa_options = '--frac 80' -//params.raven_options = '--polishing-rounds 2' -//params.trycycler_cluster_options = '' -//params.trycycler_consensus_options = '' -//params.trycycler_dotplot_options = '' -//params.trycycler_msa_options = '' -//params.trycycler_partition_options = '' -//params.trycycler_reconcile_options = '' -//params.trycycler_subsample_options = '' -//params.unicycler_options = '' +//process { +//# final directory +// publishDir = [ path: params.outdir, mode: 'copy' ] +// +//# cpu management +// withLabel: maxcpus { +// cpus = params.maxcpus +// } +// withLabel: medcpus { +// cpus = params.medcpus +// } +// +//# processes +// withName:bandage{ +// label = "process_low" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'quay.io/biocontainers/bandage:0.8.1--hc9558a2_2' +// errorStrategy = { task.attempt < 2 ? 'retry' : 'ignore' } +// time = '10m' +// } +// withName:busco{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/busco:5.6.1-prok-bacteria_odb10_2024-01-08' +// errorStrategy = { task.attempt < 2 ? 'retry' : 'ignore' } +// time = '45m' +// } +// withName:bwa{ +// label = 'process_high' +// // no publishDir +// container = 'staphb/bwa:0.7.17' +// errorStrategy = { task.attempt < 2 ? 'retry' : 'ignore' } +// time = '2h' +// } +// withName:circulocov{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'quay.io/uphl/circulocov:0.1.20240104-2024-02-21' +// time = '1h' +// errorStrategy = { task.attempt < 2 ? 
'retry' : 'ignore'}" +// } +// withName:copy{ +// label = "process_single" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/multiqc:1.19' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:dnaapler{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/dnaapler:0.7.0' +// time = '1h' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:fastp{ +// label = "process_low" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/fastp:0.23.4' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:flye{ +// label = "process_high" +// "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/flye:2.9.3' +// time = '10h' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:gfastats{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy', pattern: 'gfastats/*' +// container = 'staphb/gfastats:1.3.6' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:gfa_to_fasta{ +// label = "process_low" +// // no publishDir +// container = 'staphb/multiqc:1.19' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore' }" +// } +// withName:medaka{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'ontresearch/medaka:v1.11.3' +// time = '30m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:multiqc{ +// label = "process_low" +// publishDir = "${params.outdir}", mode: 'copy' +// container = 'staphb/multiqc:1.19' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:nanoplot_summary{ +// label = "process_low" +// publishDir = "${params.outdir}/summary", mode: 'copy' +// container = 'staphb/nanoplot:1.42.0' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:nanoplot{ +// label = "process_low" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/nanoplot:1.42.0' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:ontime{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/ontime:0.2.3' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:polypolish{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/polypolish:0.6.0' +// time = '45m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:pypolca{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/pypolca:0.3.1' +// time = '30m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:rasusa{ +// label = "process_medium" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/rasusa:0.8.0' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:raven{ +// label = "process_high" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/raven:1.8.3' +// time = '10h' +// errorStrategy = "{ task.attempt < 2 ? 
'retry' : 'ignore'}" +// } +// withName:summary{ +// label = "process_single" +// publishDir = "${params.outdir}/summary", mode: 'copy' +// container = 'staphb/multiqc:1.19' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:unicycler{ +// label = "process_high" +// publishDir = "${params.outdir}/${meta.id}", mode: 'copy' +// container = 'staphb/unicycler:0.5.0' +// time = '10h' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:versions{ +// label = "process_single" +// publishDir = "${params.outdir}/summary", mode: 'copy' +// container = 'staphb/multiqc:1.19' +// time = '10m' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:test_unicycler{ +// label = "process_single" +// publishDir = "${params.outdir}/test_files/unicycler", mode: 'copy' +// container = 'staphb/multiqc:1.19' +// time = '1h' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// withName:test_donut_falls{ +// label = "process_single" +// publishDir = "${params.outdir}/test_files/df", mode: 'copy' +// container = 'staphb/multiqc:1.19' +// time = '1h' +// errorStrategy = "{ task.attempt < 2 ? 'retry' : 'ignore'}" +// } +// } \ No newline at end of file diff --git a/configs/extra.nf b/configs/extra.nf new file mode 100755 index 0000000..4073769 --- /dev/null +++ b/configs/extra.nf @@ -0,0 +1,290 @@ +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +// TODO + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +// process ontime { +// tag "${meta.id}" +// label "process_medium" +// publishDir "${params.outdir}/${meta.id}", mode: 'copy' +// container 'staphb/ontime:0.2.3' +// errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} +// time '45m' +// +// input: +// tuple val(meta), file(reads) +// +// output: +// tuple val(meta), file("bbduk/*_rmphix_R{1,2}.fastq.gz"), emit: fastq +// path "bbduk/*", emit: files +// path "bbduk/*.phix.stats.txt", emit: stats +// path "logs/${task.process}/*.log", emit: log +// path "versions.yml", emit: versions +// +// when: +// task.ext.when == null || task.ext.when +// +// shell: +// def args = task.ext.args ?: '' +// def prefix = task.ext.prefix ?: "${meta.id}" +// """ +// ontime --version +// +// ontime --help +// +// cat <<-END_VERSIONS > versions.yml +// "${task.process}": +// ontime: "\$(ontime --version | awk '{print \$NF}')" +// END_VERSIONS +// +// exit 1 +// """ +// } + + +// someday... +// process dragonflye { +// tag "${meta.id}" +// label "process_high" +// publishDir "${params.outdir}/${meta.id}", mode: 'copy' +// container 'staphb/dragonflye:1.1.2' +// errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'} +// time '10h' +// +// input: +// tuple val(meta), file(fastq) +// +// output: +// tuple val(meta), file("dragonflye/*_dragonflye.fasta"), optional: true, emit: fasta +// tuple val(meta), file("dragonflye/*_dragonflye.gfa"), optional: true, emit: gfa +// path "dragonflye/*_assembly_info.tsv", emit: summary +// path "dragonflye/*", emit: everything +// path "versions.yml", emit: versions +// +// when: +// task.ext.when == null || task.ext.when +// +// shell: +// def args = task.ext.args ?: '' +// def prefix = task.ext.prefix ?: "${meta.id}" +// """ +// dragonflye ${args} \ +// --reads ${fastq} \ +// --cpus ${task.cpus} \ +// --outdir dragonflye \ +// --prefix ${prefix} +// +// # renaming final files +// if [ -f "dragonflye/flye-unpolished.gfa" ] ; then cp dragonflye/flye-unpolished.gfa dragonflye/${prefix}_dragonflye.gfa ; fi +// if [ -f "dragonflye/flye.fasta" ] ; then cp dragonflye/flye.fasta dragonflye/${prefix}_dragonflye.fasta ; fi +// +// # getting a summary file +// head -n 1 dragonflye/flye-info.txt | awk '{print "sample\\t" \$0}' > dragonflye/${prefix}_assembly_info.tsv +// tail -n+2 dragonflye/flye-info.txt | awk -v sample=${prefix} '{print sample "\\t" \$0}' >> dragonflye/${prefix}_assembly_info.tsv +// +// cat <<-END_VERSIONS > versions.yml +// "${task.process}": +// dragonflye: \$(dragonflye --version | awk '{print \$NF}' ) +// END_VERSIONS +// """ +// } + +// someday... +// process hybracter { +// tag "${meta.id}" +// label "process_high" +// publishDir "${params.outdir}/${meta.id}", mode: 'copy' +// container 'quay.io/biocontainers/hybracter:0.6.0--pyhdfd78af_0' +// errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} +// time '10h' +// +// input: +// tuple val(meta), file(reads), file(illumina) +// +// output: +// tuple val(meta), file("bbduk/*_rmphix_R{1,2}.fastq.gz"), emit: fasta +// tuple val(meta), file("bbduk/*_rmphix_R{1,2}.fastq.gz"), emit: gfa +// path "versions.yml", emit: versions +// +// when: +// task.ext.when == null || task.ext.when +// +// shell: +// def args = task.ext.args ?: '' +// def prefix = task.ext.prefix ?: "${meta.id}" +// """ +// hybracter -h +// +// hybracter version +// +// exit 1 +// +// cat <<-END_VERSIONS > versions.yml +// "${task.process}": +// hybracter: "\$(hybracter --version | awk '{print \$NF}')" +// END_VERSIONS +// exit 1 +// """ +// } + +// process test_nfcore { +// tag "Downloading subset15000" +// label "process_single" +// publishDir "${params.outdir}/test_files/nfcore", mode: 'copy' +// container 'staphb/multiqc:1.19' +// errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} +// time '1h' + +// output: +// tuple val("nfcore-subset15000"), file("nfcore_subset15000.fa.gz"), emit: fastq + +// when: +// task.ext.when == null || task.ext.when + +// shell: +// """ +// wget -q https://github.com/nf-core/test-datasets/blob/23f5b889e4736798c8692e9b92810d9a3e37ee97/nanopore/subset15000.fq.gz?raw=true -O nfcore_subset15000.fa.gz +// """ +// } + +// process test_great_dataset { +// tag "Downloading the great dataset" +// label "process_single" +// publishDir "${params.outdir}/test_files/great", mode: 'copy' +// container 'staphb/multiqc:1.19' +// errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'} +// time '1h' + +// output: +// tuple val("great_dataset"), file("reads.fastq.gz"), emit: fastq + +// when: +// task.ext.when == null || task.ext.when + +// shell: +// """ +// wget -q https://bridges.monash.edu/ndownloader/files/23754659 -O dataset.tar.gz +// tar -xvf dataset.tar.gz + +// exit 1 +// """ +// } + + +// process test_good_dataset { +// tag "Downloading the good dataset" +// label "process_single" +// publishDir "${params.outdir}/test_files/good", mode: 'copy' +// container 'staphb/multiqc:1.19' +// errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} +// time '1h' + +// output: +// tuple val("good_dataset"), file("reads.fastq.gz"), emit: fastq + +// when: +// task.ext.when == null || task.ext.when + +// shell: +// """ +// wget -q https://bridges.monash.edu/ndownloader/files/23754647 -O dataset.tar.gz +// tar -xvf dataset.tar.gz +// """ +// } + +// process test_mediocre_dataset { +// tag "Downloading the mediocre dataset" +// label "process_single" +// publishDir "${params.outdir}/test_files/mediocre", mode: 'copy' +// container 'staphb/multiqc:1.19' +// errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} +// time '1h' + +// output: +// tuple val("mediocre_dataset"), file("reads.fastq.gz"), emit: fastq + +// when: +// task.ext.when == null || task.ext.when + +// shell: +// """ +// wget -q https://bridges.monash.edu/ndownloader/files/23754629 -O dataset.tar.gz +// tar -xvf dataset.tar.gz + +// exit 1 +// """ +// } + +// process test_bad_dataset { +// tag "Downloading the bad dataset" +// label "process_single" +// publishDir "${params.outdir}/test_files/bad", mode: 'copy' +// container 'staphb/multiqc:1.19' +// errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} +// time '1h' + +// output: +// tuple val("bad_dataset"), file("reads.fastq.gz"), emit: fastq + +// when: +// task.ext.when == null || task.ext.when + +// shell: +// """ +// wget -q https://bridges.monash.edu/ndownloader/files/23754623 -O dataset.tar.gz +// tar -xvf dataset.tar.gz + +// exit 1 +// """ +// } + +// process test_unicycler { +// tag "Downloading Unicycler test files" +// label "process_single" +// publishDir "${params.outdir}/test_files/unicycler", mode: 'copy' +// container 'staphb/multiqc:1.19' +// errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'} +// time '1h' + +// output: +// tuple val("unicycler"), file("long_reads_low_depth.fastq.gz"), file("short_reads*.fastq.gz"), emit: fastq + +// when: +// task.ext.when == null || task.ext.when + +// shell: +// """ +// wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/short_reads_1.fastq.gz +// wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/short_reads_2.fastq.gz +// wget --quiet https://github.com/rrwick/Unicycler/raw/69e712eb95c4b9f8a46aade467260260a9ce7a91/sample_data/long_reads_low_depth.fastq.gz +// """ +// } + + // in DONUT FALLS WORKFLOW + // hybracter and plassembler are on the to-do list + // if (params.assembler =~ /hybracter/ ) { + // hybracter(ch_nanopore_input.join(ch_illumina_input, by: 0 , remainder: true)) + // + // ch_gfa = ch_gfa.mix(hybracter.out.gfa) + // // no ch_summary + // ch_consensus = ch_consensus.mix(hybracter.out.fasta) + // ch_versions = ch_versions.mix(hybracter.out.versions.first()) + // } + + // if (params.assembler =~ /dragonflye/ ) { + // dragonflye(ch_nanopore_input) + // + // dragonflye.out.summary + // .collectFile( + // storeDir: "${params.outdir}/summary/", + // keepHeader: true, + // sort: { file -> file.text }, + // name: "dragonflye_summary.tsv") + // .set { dragonflye_summary } + // + // ch_gfa = dragonflye.out.gfa + // ch_summary = ch_summary.mix(dragonflye_summary) + // // no ch_consensus + // ch_versions = ch_versions.mix(dragonflye.out.versions.first()) + // } \ No newline at end of file diff --git a/donut_falls.nf b/donut_falls.nf new file mode 100755 index 0000000..8d895e1 --- /dev/null +++ b/donut_falls.nf @@ -0,0 +1,1738 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +// read but ignored most things from +// https://carpentries-incubator.github.io/Pipeline_Training_with_Nextflow/07-Nextflow_Best_Practice/index.html + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +// Greetings! 
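+
+// For orientation, two illustrative ways to launch the workflow (the sample
+// sheet name, the docker profile, and the param values here are examples,
+// not defaults):
+//
+//   nextflow run . -profile docker --sample_sheet sample_sheet.csv
+//   nextflow run . -profile docker --test true
+//
+// and a minimal sample sheet to go with it (the columns are described in the
+// input section below; the file names are made up):
+//
+//   sample,fastq,fastq_1,fastq_2
+//   sample1,sample1.fastq.gz,sample1_R1.fastq.gz,sample1_R2.fastq.gz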
+ +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +println('') +println(' __ ___ ') +println('| ) _ _ _)_ )_ _ ) ) _ ') +println('|_/ (_) ) ) (_( (_ ( (_( ( ( ( ') +println(' _) ') +println('') + +println('Currently using the Donut Falls workflow for use with nanopore sequencing') +println('Author: Erin Young') +println('email: eriny@utah.gov') +println("Version: ${workflow.manifest.version}") +println('') + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +// Setting default param values + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + + +params.config_file = false +if (params.config_file) { + def src = new File("${workflow.projectDir}/configs/donut_falls_config_template.config") + def dst = new File("${workflow.launchDir}/edit_me.config") + dst << src.text + println("A config file can be found at ${workflow.launchDir}/edit_me.config") + exit 0 +} + +params.sequencing_summary = '' +params.sample_sheet = '' +params.assembler = 'flye' +params.outdir = 'donut_falls' +params.test = '' + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +// Checking params + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +def paramCheck(keys) { + set_keys = [ + "outdir", + "sample_sheet", + "sequencing_summary", + "assembler", + "test", + "config_file"] + + for(key in keys){ + if (key !in set_keys){ + println("FATAL: ${key} isn't a supported param!") + println("Supported params: ${set_keys}") + exit 1 + } + } +} + +paramCheck(params.keySet()) + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +// Input files + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +if (params.sequencing_summary){ + Channel + .fromPath("${params.sequencing_summary}", type:'file') + .view { "Summary File : $it" } + .set { ch_sequencing_summary } +} else { + ch_sequencing_summary = Channel.empty() +} + + +// using a sample sheet with the column header of 'sample,fastq,fastq_1,fastq_2' +// sample = meta.id +// fastq = nanopore fastq file +// fastq_1 = illumina fastq file +// fastq_2 = illumina fastq file + + +if (params.sample_sheet) { + Channel + .fromPath("${params.sample_sheet}", type: "file") + .splitCsv( header: true, sep: ',' ) + .map { it -> + meta = [id:it.sample] + tuple( meta, + "${it.fastq}", + "${it.fastq_1}", + "${it.fastq_2}" ) + } + .set{ ch_input_files } +} else { + ch_input_files = Channel.empty() +} + + +// channel for illumina files (paired-end only) +ch_input_files + .filter { it[2] != it[3] } + .map { it -> tuple (it[0], [file(it[2], checkIfExists: true), file(it[3], checkIfExists: true)])} + .set { ch_illumina_input } + +// channel for nanopore files +ch_input_files + .map { it -> tuple (it[0], file(it[1], checkIfExists: true))} + .set { ch_nanopore_input } + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +// Processes + +// ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### + +process bandage { + tag "${meta.id}" + label "process_low" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'quay.io/biocontainers/bandage:0.8.1--hc9558a2_2' + errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'} + time '10m' + + input: + tuple val(meta), file(gfa) + + output: + path "bandage/*" , emit: files + path "bandage/*.png", emit: png + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${gfa.baseName}" + """ + mkdir -p bandage + + Bandage image ${gfa} bandage/${prefix}.png ${args} + Bandage image ${gfa} bandage/${prefix}.svg ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bandage: \$(Bandage --version | awk '{print \$NF }') + END_VERSIONS + """ +} + +process busco { + tag "${meta.id}" + label "process_medium" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/busco:5.6.1-prok-bacteria_odb10_2024-01-08' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '45m' + + input: + tuple val(meta), file(fasta) + + output: + path("busco/*/*"), emit: everything + path("busco/*/short_summary*.txt"), optional: true, emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '--offline -l /busco_downloads/lineages/bacteria_odb10' + def prefix = task.ext.prefix ?: "${fasta.baseName}" + """ + busco ${args} \ + -m genome \ + -i ${fasta} \ + -o busco/${prefix} \ + --cpu ${task.cpus} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + busco: \$( busco --version | awk '{print \$NF}' ) + END_VERSIONS + """ +} + +process bwa { + tag "${meta.id}" + label 'process_high' + // no publishDir because the sam files are too big + container 'staphb/bwa:0.7.17' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '2h' + + input: + tuple val(meta), file(fasta), file(fastq) + + output: + tuple val(meta), file(fasta), file("bwa/*_{1,2}.sam"), emit: sam + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${fasta.baseName}" + """ + mkdir -p bwa + + bwa index ${fasta} + bwa mem -t ${task.cpus} -a ${fasta} ${fastq[0]} > bwa/${prefix}_1.sam + bwa mem -t ${task.cpus} -a ${fasta} ${fastq[1]} > bwa/${prefix}_2.sam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(bwa 2>&1 | grep -i version | awk '{print \$NF}') + END_VERSIONS + """ +} + +process circulocov { + tag "${meta.id}" + label "process_medium" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'quay.io/uphl/circulocov:0.1.20240104-2024-02-21' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '1h' + + input: + tuple val(meta), file(fasta), file(nanopore), file(illumina) + + output: + path "circulocov/*overall_summary.txt", emit: summary + tuple val(meta), file("circulocov/*/overall_summary.txt"), emit: results + path "circulocov/*/*", emit: everything + path "circulocov/*/fastq/*", emit: fastq + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '-a' + def prefix = task.ext.prefix ?: "${fasta.baseName}" + def reads = (illumina =~ /input/) ? 
"" : "--illumina ${illumina.join(' ')}" + """ + mkdir -p circulocov/${prefix} + + circulocov ${args} \ + --threads ${task.cpus} \ + --genome ${fasta} \ + --nanopore ${nanopore} \ + ${reads} \ + --out circulocov/${prefix} \ + --sample ${prefix} + + cp circulocov/${prefix}/overall_summary.txt circulocov/${prefix}_overall_summary.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + circulocov: \$(circulocov -v | awk '{print \$NF}') + END_VERSIONS + """ +} + +process copy { + tag "${meta.id}" + label "process_single" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/multiqc:1.19' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '10m' + + input: + tuple val(meta), file(fasta), file(circulocov), file(gfastats) + + output: + path "consensus/*", emit: fastas + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env python3 + import glob + import json + import csv + import os + + def gfastats_to_dict(header_dict): + dict = {} + with open("gfastats_summary.csv", mode='r', newline='') as file: + reader = csv.DictReader(file) + for row in reader: + if row["sample"] == header_dict['name'] + "_" + header_dict['assembler']: + key = row["Header"] + + dict[key] = row + return dict + + def circulocov_to_dict(header_dict): + dict = {} + with open("circulocov_summary.txt", mode='r', newline='') as file: + reader = csv.DictReader(file, delimiter="\t") + for row in reader: + if row["sample"].replace("_reoriented","") == header_dict['name'] + "_" + header_dict['assembler'] : + key = row["contigs"] + + dict[key] = row + return dict + + def copy_fasta(fasta, header_dict, gfa_dict, circulocov_dict): + with open(fasta, 'r') as file: + with open(f"consensus/{header_dict['fasta']}", 'w') as outfile: + for line in file: + line = line.strip() + if line.startswith('>'): + contig = line.replace(">","").split()[0] + circular = gfa_dict[contig]['circular'].replace("Y","true").replace("N","false") + length = gfa_dict[contig]['Total segment length'] + gc_per = gfa_dict[contig]['GC content %'] + meandepth = circulocov_dict[contig]['nanopore_meandepth'] + assembler = header_dict['assembler'] + step = header_dict['step'] + outfile.write(f">{contig} circ={circular} len={length} gc={gc_per} cov={meandepth} asmb={assembler} stp={step}\\n") + else: + outfile.write(f"{line}\\n") + + def main(): + os.mkdir("consensus") + header_dict = {} + fasta = glob.glob("*.fasta")[0] + header_dict['fasta'] = fasta + + name = fasta.replace(".fasta", "") + + assemblers = ['dragonflye', 'flye', 'hybracter', 'raven', 'unicycler'] + steps = ["reoriented", 'polypolish', 'pypolca', 'medaka'] + for step in steps: + if step in name: + header_dict['step'] = step + name = name.replace(f"_{step}","") + break + + if 'step' not in header_dict.keys(): + header_dict['step'] = False + + for assembler in assemblers: + if assembler in name: + header_dict['assembler'] = assembler + name = name.replace(f"_{assembler}","") + break + + header_dict['name'] = name + + gfa_dict = gfastats_to_dict(header_dict) + circulocov_dict = circulocov_to_dict(header_dict) + + copy_fasta(fasta, header_dict, gfa_dict, circulocov_dict) + + if __name__ == "__main__": + main() + """ +} + +process dnaapler { + tag "${meta.id}" + label "process_medium" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/dnaapler:0.7.0' + errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'}
+  time '1h'
+
+  input:
+  tuple val(meta), file(fasta), file(ignore)
+
+  output:
+  tuple val(meta), file("dnaapler/*_reoriented.fasta"), emit: fasta
+  path "dnaapler/*", emit: files
+  path "versions.yml", emit: versions
+
+  when:
+  task.ext.when == null || task.ext.when
+
+  shell:
+  def args = task.ext.args ?: ''
+  def prefix = task.ext.prefix ?: "${fasta.baseName}"
+  """
+  dnaapler all ${args} \
+    --input ${fasta} \
+    --prefix ${prefix} \
+    --output dnaapler \
+    --threads ${task.cpus} \
+    --ignore ${ignore}
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+    dnaapler: \$(dnaapler --version | awk '{print \$NF}')
+  END_VERSIONS
+  """
+}
+
+process fastp {
+  tag "${meta.id}"
+  label "process_low"
+  publishDir "${params.outdir}/${meta.id}", mode: 'copy'
+  container 'staphb/fastp:0.23.4'
+  errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
+  time '10m'
+
+  input:
+  tuple val(meta), file(reads), val(type)
+
+  output:
+  tuple val(meta), file("fastp/*_fastp*.fastq.gz"), val(type), emit: fastq
+  path "fastp/*", emit: everything
+  path "fastp/*_fastp*.json", emit: summary
+  path "versions.yml", emit: versions
+
+  when:
+  task.ext.when == null || task.ext.when
+
+  shell:
+  def args = task.ext.args ?: ''
+  def lrargs = task.ext.lrargs ?: '--qualified_quality_phred 12 --length_required 1000'
+  def prefix = task.ext.prefix ?: "${meta.id}"
+  if (type == "illumina"){
+    """
+    mkdir -p fastp
+
+    fastp ${args} \
+      --in1 ${reads[0]} \
+      --in2 ${reads[1]} \
+      --out1 fastp/${prefix}_fastp_sr_R1.fastq.gz \
+      --out2 fastp/${prefix}_fastp_sr_R2.fastq.gz \
+      -h fastp/${prefix}_fastp_sr.html \
+      -j fastp/${prefix}_fastp_sr.json
+
+    passed_filter_reads=\$(grep passed_filter_reads fastp/${prefix}_fastp_sr.json | awk '{print \$NF}' | head -n 1 )
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+      fastp: \$(fastp --version 2>&1 | awk '{print \$NF}' )
+    END_VERSIONS
+    """
+  } else {
+    """
+    mkdir -p fastp
+
+    fastp ${lrargs} \
+      --in1 ${reads[0]} \
+      --out1 fastp/${prefix}_fastp_lr.fastq.gz \
+      -h fastp/${prefix}_fastp_lr.html \
+      -j fastp/${prefix}_fastp_lr.json
+
+    passed_filter_reads=\$(grep passed_filter_reads fastp/${prefix}_fastp_lr.json | awk '{print \$NF}' | head -n 1 )
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+      fastp: \$(fastp --version 2>&1 | awk '{print \$NF}')
+    END_VERSIONS
+    """
+  }
+}
+
+process flye {
+  tag "${meta.id}"
+  label "process_high"
+  publishDir "${params.outdir}/${meta.id}", mode: 'copy'
+  container 'staphb/flye:2.9.3'
+  errorStrategy { task.attempt < 2 ?
'retry' : 'ignore'} + time '10h' + + input: + tuple val(meta), file(fastq) + + output: + tuple val(meta), file("flye/*_flye.fasta"), emit: fasta, optional: true + tuple val(meta), file("flye/*_flye.gfa"), emit: gfa, optional: true + path "flye/*_assembly_info.tsv", emit: summary + path "flye/*", emit: everything + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p flye + + flye ${args} \ + --nano-raw ${fastq} \ + --threads ${task.cpus} \ + --out-dir flye + + # renaming final files + if [ -f "flye/assembly.fasta" ] ; then cp flye/assembly.fasta flye/${prefix}_flye.fasta ; fi + if [ -f "flye/assembly_graph.gfa" ] ; then cp flye/assembly_graph.gfa flye/${prefix}_flye.gfa ; fi + + # getting a summary file + head -n 1 flye/assembly_info.txt | awk '{print "sample\\t" \$0}' > flye/${prefix}_assembly_info.tsv + tail -n+2 flye/assembly_info.txt | awk -v sample=${prefix} '{print sample "\\t" \$0}' >> flye/${prefix}_assembly_info.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flye: \$( flye --version | awk '{print \$NF}') + END_VERSIONS + """ +} + +process gfastats { + tag "${meta.id}" + label "process_medium" + publishDir "${params.outdir}/${meta.id}", mode: 'copy', pattern: 'gfastats/*' + container 'staphb/gfastats:1.3.6' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '10m' + + input: + tuple val(meta), file(gfa) + + output: + tuple val(meta), file(gfa), file("gfastats/*_gfastats_summary.csv"), emit: stats + path "gfastats/*_gfastats_summary.csv", emit: summary + path "gfastats/*", emit: everything + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${gfa.baseName}" + """ + mkdir -p gfastats + + gfastats \ + ${gfa} \ + ${args} \ + --threads ${task.cpus} \ + --tabular \ + --seq-report \ + > gfastats/${prefix}_gfastats.txt + + head -n 1 gfastats/${prefix}_gfastats.txt | tr "\\t" "," | awk '{print "sample," \$0 "circular" }' > gfastats/${prefix}_gfastats_summary.csv + tail -n+2 gfastats/${prefix}_gfastats.txt | tr "\\t" "," | awk -v sample=${prefix} '{print sample "," \$0 }' >> gfastats/${prefix}_gfastats_summary.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gfastats: \$( gfastats -v | head -n 1 | awk '{print \$NF}') + END_VERSIONS + """ +} + +process gfa_to_fasta { + tag "${meta.id}" + label "process_low" + // no publishDir + container 'staphb/multiqc:1.19' + errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'} + time '10m' + + input: + tuple val(meta), file(gfa), file(stats) + + output: + tuple val(meta), file("*fasta"), file("noncircular.txt"), emit: fasta + + when: + task.ext.when == null || task.ext.when + + """ + #!/usr/bin/env python3 + + import csv + import glob + + def convert_to_fasta(summary_dict, gfa_file): + outfile = '_'.join(gfa_file.split('.')[:-1]) + ".fasta" + with open(gfa_file, mode='r') as file: + for line in file: + parts = line.split() + if parts and parts[0] == "S": + header = parts[1] + seq = parts[2] + if header in summary_dict.keys(): + new_header = ">" + header + " length=" + summary_dict[header]['Total segment length'] + " circular=" + summary_dict[header]["circular"].replace("N","false").replace("Y","true") + " gc_per=" + summary_dict[header]["GC content %"] + "\\n" + with open(outfile, mode='a') as output_file: + output_file.write(new_header) + output_file.write(seq + "\\n") + + def read_summary_csv(gfastats_file): + summary_dict = {} + with open(gfastats_file, mode='r', newline='') as file: + reader = csv.DictReader(file) + for row in reader: + key = row['Header'] + summary_dict[key] = row + with open("noncircular.txt", mode='a') as output_file: + if summary_dict[key]["circular"] == "N": + output_file.write(key + "\\n") + return summary_dict + + gfastats_file = glob.glob("*_gfastats_summary.csv") + gfa_file = glob.glob("*.gfa") + + summary_dict = read_summary_csv(gfastats_file[0]) + convert_to_fasta(summary_dict, gfa_file[0]) + """ +} + +// From https://github.com/nanoporetech/medaka +// > It is not recommended to specify a value of --threads greater than 2 for medaka consensus since the compute scaling efficiency is poor beyond this. +// > Note also that medaka consensus may been seen to use resources equivalent to + 4 as an additional 4 threads are used for reading and preparing input data. +process medaka { + tag "${meta.id}" + label "process_medium" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'ontresearch/medaka:v1.11.3' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '30m' + + input: + tuple val(meta), path(fasta), path(fastq) + + output: + tuple val(meta), path("medaka/*_medaka.fasta"), emit: fasta + path "medaka/*", emit: everything + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${fasta.baseName.replaceAll('_reoriented','')}" + """ + mkdir -p medaka + + # someday... + # medaka tools resolve_model --auto_model consensus ${fastq} + + medaka_consensus ${args} \ + -i ${fastq} \ + -d ${fasta} \ + -o medaka \ + -t 1 + + if [ -f "medaka/consensus.fasta" ]; then cp medaka/consensus.fasta medaka/${prefix}_medaka.fasta ; fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + medaka: \$( medaka --version | awk '{print \$NF}') + END_VERSIONS + """ +} + +process multiqc { + tag "combining reports" + label "process_low" + publishDir "${params.outdir}", mode: 'copy' + container 'staphb/multiqc:1.19' + errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'} + time '10m' + + input: + file(input) + + output: + path "multiqc/multiqc_report.html", emit: report + path "multiqc/multiqc_data/*", emit: everything + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + """ + if [ -f "pypolca_summary.tsv" ] + then + echo "# plot_type: 'table'" > pypolca_mqc.txt + echo "# section_name: 'pypolca'" >> pypolca_mqc.txt + echo "# description: 'Long read polishing'" >> pypolca_mqc.txt + echo "# pconfig:" >> pypolca_mqc.txt + echo "# namespace: 'Cust Data'" >> pypolca_mqc.txt + echo "# headers:" >> pypolca_mqc.txt + echo "# Substitution_Errors_Found:" >> pypolca_mqc.txt + echo "# title: 'Substitution Errors Found'" >> pypolca_mqc.txt + echo "# description: 'Substitution Errors Found'" >> pypolca_mqc.txt + echo "# Insertion/Deletion_Errors_Found:" >> pypolca_mqc.txt + echo "# title: 'Insertion/Deletion Errors Found'" >> pypolca_mqc.txt + echo "# description: 'Insertion/Deletion Errors Found'" >> pypolca_mqc.txt + echo "# Assembly_Size:" >> pypolca_mqc.txt + echo "# title: 'Assembly Size'" >> pypolca_mqc.txt + echo "# description: 'Assembly Size'" >> pypolca_mqc.txt + echo "# Consensus_Quality_Before_Polishing:" >> pypolca_mqc.txt + echo "# title: 'Consensus Quality Before Polishing'" >> pypolca_mqc.txt + echo "# description: 'Consensus Quality Before Polishing'" >> pypolca_mqc.txt + echo "# Consensus_QV_Before_Polishing:" >> pypolca_mqc.txt + echo "# title: 'Consensus QV Before Polishing'" >> pypolca_mqc.txt + echo "# description: 'Consensus QV Before Polishing'" >> pypolca_mqc.txt + cat pypolca_summary.tsv >> pypolca_mqc.txt + fi + + if [ -f "gfastats_summary.csv" ] + then + echo "# plot_type: 'table'" > gfastats_mqc.csv + echo "# section_name: 'gfastats'" >> gfastats_mqc.csv + echo "# description: 'Metrics for GFA files'" >> gfastats_mqc.csv + echo "# pconfig:" >> gfastats_mqc.csv + echo "# namespace: 'Cust Data'" >> gfastats_mqc.csv + echo "# headers:" >> gfastats_mqc.csv + echo "# sample:" >> gfastats_mqc.csv + echo "# title: 'Sample and analysis'" >> gfastats_mqc.csv + echo "# description: 'Sample and analysis that generated contig'" >> gfastats_mqc.csv + echo "# Header:" >> gfastats_mqc.csv + echo "# title: 'Header'" >> gfastats_mqc.csv + echo "# description: 'Name of contig'" >> gfastats_mqc.csv + echo "# Total segment length:" >> gfastats_mqc.csv + echo "# title: 'Total segment length'" >> gfastats_mqc.csv + echo "# description: 'Total segment length'" >> gfastats_mqc.csv + echo "# A:" >> gfastats_mqc.csv + echo "# title: 'A'" >> gfastats_mqc.csv + echo "# description: 'Number of A'" >> gfastats_mqc.csv + echo "# C:" >> gfastats_mqc.csv + echo "# title: 'C'" >> gfastats_mqc.csv + echo "# description: 'Number of C'" >> gfastats_mqc.csv + echo "# G:" >> gfastats_mqc.csv + echo "# title: 'G'" >> gfastats_mqc.csv + echo "# description: 'Number of G'" >> gfastats_mqc.csv + echo "# T:" >> gfastats_mqc.csv + echo "# title: 'T'" >> gfastats_mqc.csv + echo "# description: 'Number of T'" >> gfastats_mqc.csv + echo "# GC content %:" >> gfastats_mqc.csv + echo "# title: 'GC content %'" >> gfastats_mqc.csv + echo "# description: 'GC content %'" >> gfastats_mqc.csv + echo "# # soft-masked bases:" >> gfastats_mqc.csv + echo "# title: '# soft-masked bases'" >> gfastats_mqc.csv + echo "# description: '# soft-masked bases'" >> gfastats_mqc.csv + echo "# circular:" >> gfastats_mqc.csv + echo "# title: 'circular'" >> gfastats_mqc.csv + echo "# description: 'circular'" >> gfastats_mqc.csv + cat 
gfastats_summary.csv | awk '{print NR ',' \$0}' >> gfastats_mqc.csv + fi + + if [ -f "flye_summary.tsv" ] + then + echo "# plot_type: 'table'" > flye_mqc.csv + echo "# section_name: 'flye'" >> flye_mqc.csv + echo "# description: 'Assembly Info'" >> flye_mqc.csv + echo "# pconfig:" >> flye_mqc.csv + echo "# namespace: 'Cust Data'" >> flye_mqc.csv + echo "# headers:" >> flye_mqc.csv + echo "# sample:" >> flye_mqc.csv + echo "# title: 'Sample'" >> flye_mqc.csv + echo "# description: 'Sample that generated contig'" >> flye_mqc.csv + echo "# #seq_name:" >> flye_mqc.csv + echo "# title: '#seq_name'" >> flye_mqc.csv + echo "# description: 'Name of contig'" >> flye_mqc.csv + echo "# length:" >> flye_mqc.csv + echo "# title: 'length'" >> flye_mqc.csv + echo "# description: 'length'" >> flye_mqc.csv + echo "# cov.:" >> flye_mqc.csv + echo "# title: 'cov'" >> flye_mqc.csv + echo "# description: 'Coverage'" >> flye_mqc.csv + echo "# circ:" >> flye_mqc.csv + echo "# title: 'circ'" >> flye_mqc.csv + echo "# description: 'Whether contig is circular'" >> flye_mqc.csv + echo "# repeat:" >> flye_mqc.csv + echo "# title: 'repeat'" >> flye_mqc.csv + echo "# description: 'repeat'" >> flye_mqc.csv + echo "# mult.:" >> flye_mqc.csv + echo "# title: 'mult'" >> flye_mqc.csv + echo "# description: 'mult.'" >> flye_mqc.csv + echo "# alt_group:" >> flye_mqc.csv + echo "# title: 'alt_group'" >> flye_mqc.csv + echo "# description: 'alt_group'" >> flye_mqc.csv + echo "# graph_path:" >> flye_mqc.csv + echo "# title: 'graph_path'" >> flye_mqc.csv + echo "# description: 'graph_path'" >> flye_mqc.csv + cat flye_summary.tsv | awk '{print NR '\\t' \$0}' >> flye_mqc.csv + fi + + circulocov_check=\$(ls * | grep overall_summary.txt | head -n 1) + if [ -n "\$circulocov_check" ] + then + illumina_check=\$(grep -h illumina *overall_summary.txt | head -n 1) + if [ -n "\$illumina_check" ] + then + circulocov_summary_header=\$illumina_check + else + circulocov_summary_header=\$(grep -h nanopore_numreads *overall_summary.txt | head -n 1) + fi + + echo \$circulocov_summary_header | awk '{print \$1 "\\t" \$2 "\\t" \$3 "\\t" \$4 "\\t" \$5 "\\t" \$6 "\\t" \$7 "\\t" \$8 "\\t" \$9 "\\t" \$10 "\\t" \$11 "\\t" \$12}' > circulocov_summary.txt + cat *overall_summary.txt | grep -v nanopore_numreads | awk '{print \$1 "\\t" \$2 "\\t" \$3 "\\t" \$4 "\\t" \$5 "\\t" \$6 "\\t" \$7 "\\t" \$8 "\\t" \$9 "\\t" \$10 "\\t" \$11 "\\t" \$12}' >> circulocov_summary.txt + + echo "# plot_type: 'table'" > circulocov_mqc.txt + echo "# section_name: 'CirculoCov'" >> circulocov_mqc.txt + echo "# description: 'Coverage estimates for circular sequences'" >> circulocov_mqc.txt + echo "# pconfig:" >> circulocov_mqc.txt + echo "# namespace: 'Cust Data'" >> circulocov_mqc.txt + echo "# headers:" >> circulocov_mqc.txt + echo "# sample:" >> circulocov_mqc.txt + echo "# title: 'Sample'" >> circulocov_mqc.txt + echo "# description: 'Sample that generated contig'" >> circulocov_mqc.txt + echo "# circ:" >> circulocov_mqc.txt + echo "# title: 'circ'" >> circulocov_mqc.txt + echo "# description: 'Whether contig was circular'" >> circulocov_mqc.txt + echo "# contigs:" >> circulocov_mqc.txt + echo "# title: 'contigs'" >> circulocov_mqc.txt + echo "# description: 'name of contig'" >> circulocov_mqc.txt + echo "# length:" >> circulocov_mqc.txt + echo "# title: 'length'" >> circulocov_mqc.txt + echo "# description: 'length of contig'" >> circulocov_mqc.txt + echo "# nanopore_numreads:" >> circulocov_mqc.txt + echo "# title: 'numreads'" >> circulocov_mqc.txt + echo "# description: 
'number of nanopore reads mapping to contig'" >> circulocov_mqc.txt + echo "# nanopore_covbases:" >> circulocov_mqc.txt + echo "# title: 'covbases'" >> circulocov_mqc.txt + echo "# description: 'nanopore covbases of contig'" >> circulocov_mqc.txt + echo "# nanopore_coverage:" >> circulocov_mqc.txt + echo "# title: 'coverage'" >> circulocov_mqc.txt + echo "# description: 'nanopore coverage of contig'" >> circulocov_mqc.txt + echo "# nanopore_meandepth:" >> circulocov_mqc.txt + echo "# title: 'meandepth'" >> circulocov_mqc.txt + echo "# description: 'nanopore meandepth of contig'" >> circulocov_mqc.txt + cat circulocov_summary.txt | awk '{print NR '\\t' \$0}' >> circulocov_mqc.txt + fi + + touch whatever.png + + pngs=\$(ls *png) + for png in \${pngs[@]} + do + new_name=\$(echo \$png | sed 's/.png\$/_mqc.png/g') + cp \$png \$new_name + done + + rm whatever_mqc.png + + multiqc ${args} \ + --outdir multiqc \ + . + """ +} + +process nanoplot_summary { + tag "${summary}" + label "process_low" + publishDir "${params.outdir}/summary", mode: 'copy' + container 'staphb/nanoplot:1.42.0' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '10m' + + input: + file(summary) + + output: + path "nanoplot/summary", emit: final_directory + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + """ + mkdir -p nanoplot + + NanoPlot ${args} \ + --summary ${summary} \ + --threads ${task.cpus} \ + --outdir nanoplot \ + --tsv_stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nanoplot: \$(NanoPlot --version | awk '{print \$NF}')) + END_VERSIONS + + exit 1 + """ +} + +process nanoplot { + tag "${meta.id}" + label "process_low" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/nanoplot:1.42.0' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '10m' + + input: + tuple val(meta), file(fastq) + + output: + path "nanoplot/*", emit: everything + path "nanoplot/${meta.id}_NanoStats.txt", emit: stats + path "nanoplot/${meta.id}_NanoStats.csv", emit: summary + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p nanoplot + + NanoPlot ${args} \ + --fastq ${fastq} \ + --threads ${task.cpus} \ + --tsv_stats \ + --outdir nanoplot + + cp nanoplot/NanoStats.txt nanoplot/${prefix}_NanoStats.txt + + echo "sample,\$( cut -f 1 nanoplot/${prefix}_NanoStats.txt | tr '\\n' ',' )" > nanoplot/${prefix}_NanoStats.csv + echo "${prefix},\$(cut -f 2 nanoplot/${prefix}_NanoStats.txt | tr '\\n' ',' )" >> nanoplot/${prefix}_NanoStats.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nanoplot: \$(NanoPlot --version | awk '{print \$NF}') + END_VERSIONS + """ +} + +process polypolish { + tag "${meta.id}" + label "process_medium" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/polypolish:0.6.0' + errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'} + time '45m' + + input: + tuple val(meta), file(fasta), file(sam) + + output: + tuple val(meta), file("polypolish/*_polypolish.fasta"), emit: fasta + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def filarg = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${fasta.baseName.replaceAll('_medaka','')}" + """ + mkdir -p polypolish + + polypolish filter \ + ${filarg} \ + --in1 ${sam[0]} \ + --in2 ${sam[1]} \ + --out1 ${prefix}_filtered_1.sam \ + --out2 ${prefix}_filtered_2.sam + + polypolish polish \ + ${args} \ + ${fasta} \ + ${prefix}_filtered_1.sam \ + ${prefix}_filtered_2.sam \ + > polypolish/${prefix}_polypolish.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + polypolish: \$(polypolish --version | awk '{print \$NF}') + END_VERSIONS + """ +} + +process pypolca { + tag "${meta.id}" + label "process_medium" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/pypolca:0.3.1' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '30m' + + input: + tuple val(meta), file(fasta), file(fastq) + + output: + tuple val(meta), file("pypolca/*_pypolca.fasta"), optional: true, emit: fasta + path "pypolca/*pypolca_summary.tsv", optional: true, emit: summary + path "pypolca/*/*", emit: everything + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${fasta.baseName.replaceAll('_polypolish','')}" + """ + pypolca run ${args}\ + -a ${fasta} \ + -1 ${fastq[0]} \ + -2 ${fastq[1]} \ + -t ${task.cpus} \ + -o pypolca/${prefix} + + if [ -f "pypolca/${prefix}/pypolca.report" ] + then + cut -f 1 -d : pypolca/${prefix}/pypolca.report | \ + sed 's/ /_/g' | \ + tr "\\n" "\\t" | \ + awk '{print "sample\\t" \$0 }' \ + > pypolca/${prefix}_pypolca_summary.tsv + + cut -f 2 -d : pypolca/${prefix}/pypolca.report | \ + awk '{( \$1 = \$1 ) ; print \$0 }' | \ + sed 's/ /_/g' | \ + tr "\\n" "\\t" | \ + awk '{print "${prefix}\\t" \$0 }' \ + >> pypolca/${prefix}_pypolca_summary.tsv + fi + + if [ -f "pypolca/${prefix}/pypolca_corrected.fasta" ]; then cp pypolca/${prefix}/pypolca_corrected.fasta pypolca/${prefix}_pypolca.fasta ; fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pypolca: \$(pypolca --version | awk '{print \$NF}') + END_VERSIONS + """ +} + +process rasusa { + tag "${meta.id}" + label "process_medium" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/rasusa:0.8.0' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '10m' + + input: + tuple val(meta), file(fastq) + + output: + tuple val(meta), file("rasusa/*.fastq.gz"), emit: fastq + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '--genome-size 5mb --coverage 150' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p rasusa + + rasusa ${args} \ + --input ${fastq} \ + --output rasusa/${prefix}_rasusa.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rasusa: \$(rasusa --version | awk '{print \$NF}' ) + END_VERSIONS + """ +} + +process raven { + tag "${meta.id}" + label "process_high" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/raven:1.8.3' + errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'} + time '10h' + + input: + tuple val(meta), file(fastq) + + output: + tuple val(meta), file("raven/*_raven.fasta"), emit: fasta, optional: true + tuple val(meta), file("raven/*_raven.gfa"), emit: gfa, optional: true + path("raven/*"), emit: everything + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '--polishing-rounds 2' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p raven + + raven ${args} \ + --threads ${task.cpus} \ + --graphical-fragment-assembly raven/${prefix}_raven.gfa \ + ${fastq} \ + > raven/${prefix}_raven.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + raven: \$( raven --version | awk '{print \$NF}' ) + END_VERSIONS + """ +} + +process summary { + tag "Creating summary" + label "process_single" + publishDir "${params.outdir}/summary", mode: 'copy' + container 'staphb/multiqc:1.19' + errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'} + time '10m' + + input: + file(input) + + output: + path "donut_falls_summary.json", emit: summary + + when: + task.ext.when == null || task.ext.when + + """ + #!/usr/bin/env python3 + + import glob + import json + import csv + from os.path import exists + + def file_to_dict(file, header, delim): + dict = {} + with open(file, mode='r', newline='') as file: + reader = csv.DictReader(file, delimiter=delim) + for row in reader: + key = row[header] + dict[key] = row + return dict + + def file_to_dict_uniq(file, header, header2, delim): + dict = {} + with open(file, mode='r', newline='') as file: + reader = csv.DictReader(file, delimiter=delim) + for row in reader: + if row[header] not in dict.keys(): + dict[row[header]] = {} + key = row[header] + "_" + row[header2] + dict[row[header]][key] = row + return dict + + def final_file(dict): + with open('donut_falls_summary.json', 'w') as json_file: + json.dump(dict, json_file, indent=4) + + def main(): + if exists('nanoplot_summary.csv') : + nanoplot_dict = file_to_dict('nanoplot_summary.csv', 'sample', ',') + else: + nanoplot_dict = {} + + if exists('pypolca_summary.tsv') : + pypolca_dict = file_to_dict('pypolca_summary.tsv', 'sample', '\t') + else: + pypolca_dict = {} + + if exists('gfastats_summary.csv') : + gfastats_dict = file_to_dict_uniq('gfastats_summary.csv', 'sample', 'Header', ',') + else: + gfastats_dict = {} + + busco_dict = {} + busco_files = glob.glob("short_summary*txt") + for file in busco_files: + sample_analysis = file.split(".")[-2] + with open(file, 'r') as f: + for line in f: + if "C:" and "S:" and "D:" and "F:" and "M:" and "n:" in line: + busco_dict[sample_analysis] = line.strip() + break + + circulocov_dict = {} + circulocov_files = glob.glob("*overall_summary.txt") + for file in circulocov_files: + sample_analysis = file.replace("_overall_summary.txt", "").replace("_reoriented", "") + circulocov_dict[sample_analysis] = {} + with open(file, 'r') as f: + for line in f: + parts = line.split() + if parts[2] == "all": + circulocov_dict[sample_analysis]["coverage"] = parts[7] + + if parts[2] == "missing": + if len(parts) > 8: + unmapped_illumina = parts[8] + else: + unmapped_illumina = 0 + + circulocov_dict[sample_analysis]["unmapped_nanopore"] = parts[4] + circulocov_dict[sample_analysis]["unmapped_illumina"] = unmapped_illumina + + final_results = {} + assemblers = ['dragonflye', 'flye', 'hybracter', 'raven', 'unicycler'] + for key in nanoplot_dict.keys(): + final_results[key] = {} + final_results[key]['name'] = key + + # from nanostas + 
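+      # For orientation, one sample's entry in donut_falls_summary.json ends up
+      # shaped roughly like this (the names and numbers below are made up):
+      #   "sample1": {
+      #     "name": "sample1",
+      #     "number_of_reads": "9424",
+      #     "mean_read_length": "10493.6",
+      #     "mean_qual": "12.2",
+      #     "flye": {
+      #       "total_length": 5138942,
+      #       "num_contigs": 2,
+      #       "circ_contigs": 1,
+      #       "coverage": "151.3",
+      #       "unmapped_nanopore": "172",
+      #       "unmapped_illumina": "511",
+      #       "busco": "C:98.4%[S:98.4%,D:0.0%],F:0.8%,M:0.8%,n:124"
+      #     }
+      #   }
+      # (plus busco_polypolish / busco_pypolca / busco_medaka and the pypolca
+      # consensus-quality fields when those steps ran)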
final_results[key]['number_of_reads'] = nanoplot_dict[key]['number_of_reads'] + final_results[key]['mean_read_length'] = nanoplot_dict[key]['mean_read_length'] + final_results[key]['mean_qual'] = nanoplot_dict[key]['mean_qual'] + for assembler in assemblers: + if key + "_" + assembler in gfastats_dict.keys(): + final_results[key][assembler] = {} + + # gfastats results + total_length = 0 + num_circular = 0 + for contig in gfastats_dict[key + "_" + assembler].keys(): + total_length = total_length + int(gfastats_dict[key + "_" + assembler][contig]["Total segment length"]) + if gfastats_dict[key + "_" + assembler][contig]["circular"] == "Y": + num_circular = num_circular + 1 + + final_results[key][assembler]['total_length'] = total_length + final_results[key][assembler]['num_contigs'] = len(gfastats_dict[key + "_" + assembler].keys()) + final_results[key][assembler]['circ_contigs'] = num_circular + + # circulocov results + if key + "_" + assembler in circulocov_dict.keys(): + final_results[key][assembler]['coverage'] = circulocov_dict[key + '_' + assembler]['coverage'] + final_results[key][assembler]['unmapped_nanopore'] = circulocov_dict[key + '_' + assembler]['unmapped_nanopore'] + final_results[key][assembler]['unmapped_illumina'] = circulocov_dict[key + '_' + assembler]['unmapped_illumina'] + + # busco results + if key + "_" + assembler in busco_dict.keys(): + final_results[key][assembler]['busco'] = busco_dict[key + "_" + assembler] + if key + "_" + assembler + '_reoriented' in busco_dict.keys(): + final_results[key][assembler]['busco'] = busco_dict[key + "_" + assembler + '_reoriented'] + for step in ['polypolish', 'pypolca', 'medaka']: + if key + "_" + assembler + '_' + step in busco_dict.keys(): + final_results[key][assembler]['busco_' + step ] = busco_dict[key + "_" + assembler + '_' + step] + else: + final_results[key][assembler]['busco_' + step ] = 'NF' + + # pypolca results + if key + "_" + assembler in pypolca_dict.keys(): + final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = pypolca_dict[key + "_" + assembler]['Consensus_Quality_Before_Polishing'] + final_results[key][assembler]['Consensus_QV_Before_Polishing'] = pypolca_dict[key + "_" + assembler]['Consensus_QV_Before_Polishing'] + else: + final_results[key][assembler]['Consensus_Quality_Before_Polishing'] = 0 + final_results[key][assembler]['Consensus_QV_Before_Polishing'] = 0 + + final_file(final_results) + + if __name__ == "__main__": + main() + + """ +} + +process unicycler { + tag "${meta.id}" + label "process_high" + publishDir "${params.outdir}/${meta.id}", mode: 'copy' + container 'staphb/unicycler:0.5.0' + errorStrategy { task.attempt < 2 ? 
'retry' : 'ignore'}
+  time '10h'
+
+  input:
+  tuple val(meta), file(illumina), file(nanopore)
+
+  output:
+  tuple val(meta), file("unicycler/*_unicycler.fasta"), emit: fasta, optional: true
+  tuple val(meta), file("unicycler/*_unicycler.gfa"), emit: gfa, optional: true
+  path "unicycler/*", emit: everything
+  path "versions.yml", emit: versions
+
+  when:
+  task.ext.when == null || task.ext.when
+
+  shell:
+  def args = task.ext.args ?: ''
+  def prefix = task.ext.prefix ?: "${meta.id}"
+  """
+  mkdir -p unicycler
+
+  unicycler ${args} \
+    -1 ${illumina[0]} \
+    -2 ${illumina[1]} \
+    -l ${nanopore} \
+    -o unicycler/ \
+    -t ${task.cpus}
+
+  if [ -f "unicycler/assembly.fasta" ] ; then cp unicycler/assembly.fasta unicycler/${prefix}_unicycler.fasta ; fi
+  if [ -f "unicycler/assembly.gfa" ] ; then cp unicycler/assembly.gfa unicycler/${prefix}_unicycler.gfa ; fi
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+    unicycler: \$(unicycler --version | awk '{print \$NF }' )
+  END_VERSIONS
+  """
+}
+
+process versions {
+  tag "extracting versions"
+  label "process_single"
+  publishDir "${params.outdir}/summary", mode: 'copy'
+  container 'staphb/multiqc:1.19'
+  time '10m'
+  errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
+
+  input:
+  file(input)
+
+  output:
+  path "software_versions_mqc.yml", emit: versions
+  path "software_versions.yml", emit: yml
+
+  when:
+  task.ext.when == null || task.ext.when
+
+  """
+  #!/usr/bin/env python3
+
+  # Stolen and modified from
+  # https://github.com/nf-core/rnaseq/blob/b89fac32650aacc86fcda9ee77e00612a1d77066/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py#L4
+
+  import yaml
+  from textwrap import dedent
+
+  def _make_versions_html(versions):
+
+    html = [
+      dedent(
+        \"\"\"\\
+        <table class="table" style="width:100%">
+          <thead>
+            <tr>
+              <th> Process Name </th>
+              <th> Software </th>
+              <th> Version </th>
+            </tr>
+          </thead>
+        \"\"\"
+      )
+    ]
+    for process, tmp_versions in sorted(versions.items()):
+      html.append("<tbody>")
+      for i, (tool, version) in enumerate(sorted(tmp_versions.items())):
+        html.append(
+          dedent(
+            f\"\"\"\\
+            <tr>
+              <td><samp>{process if (i == 0) else ''}</samp></td>
+              <td><samp>{tool}</samp></td>
+              <td><samp>{version}</samp></td>
+            </tr>
+            \"\"\"
+          )
+        )
+      html.append("</tbody>")
+    html.append("</table>")
+    return "\\n".join(html)
+
+  def main():
+
+    with open("versions.yml") as f:
+      versions_by_process = yaml.load(f, Loader=yaml.BaseLoader)
+
+    versions_by_module = {}
+    for process, process_versions in versions_by_process.items():
+      module = process.split(":")[-1]
+      try:
+        if versions_by_module[module] != process_versions:
+          raise AssertionError(
+            "There's something wrong with the designated containers of this workflow"
+          )
+      except KeyError:
+        versions_by_module[module] = process_versions
+
+    versions_mqc = {
+      "id": "software_versions",
+      "section_name": "Donut Falls Software Versions",
+      "section_href": "https://github.com/UPHL-BioNGS/Donut_Falls",
+      "plot_type": "html",
+      "description": "Collected at run time from the software output.",
+      "data": _make_versions_html(versions_by_module),
+    }
+
+    with open("software_versions.yml", "w") as f:
+      yaml.dump(versions_by_module, f, default_flow_style=False)
+
+    with open("software_versions_mqc.yml", "w") as f:
+      yaml.dump(versions_mqc, f, default_flow_style=False)
+
+  if __name__ == "__main__":
+    main()
+
+  """
+}
+
+// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
+
+// Downloading files for testing
+
+// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
+
+process test {
+  tag "Downloading R10.4 reads"
+  label "process_single"
+  publishDir "${params.outdir}/test_files/df", mode: 'copy'
+  container 'staphb/multiqc:1.19'
+  errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
+  time '1h'
+
+  output:
+  tuple val("df"), file("test_files/test_nanopore.fastq.gz"), file("test_files/test_illumina_{1,2}.fastq.gz"), emit: fastq
+
+  when:
+  task.ext.when == null || task.ext.when
+
+  shell:
+  """
+  wget --quiet https://zenodo.org/records/10779911/files/df_test_files.tar.gz?download=1 -O dataset.tar.gz
+  tar -xvf dataset.tar.gz
+  """
+}
+
+// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
+
+// Donut Falls
+
+// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
+
+workflow DONUT_FALLS {
+  take:
+  ch_nanopore_input
+  ch_illumina_input
+
+  main:
+  // channel for gfa files for gfa stats
+  ch_gfa = Channel.empty()
+  // channel for files for multiqc or workflow summary
+  ch_summary = Channel.empty()
+  // channel for assembled genomes
+  ch_consensus = Channel.empty()
+  // versions channel
+  ch_versions = Channel.empty()
+
+  if (params.assembler =~ /unicycler/ ) {
+    unicycler(ch_illumina_input.join(ch_nanopore_input, by: 0, remainder: false))
+
+    ch_gfa = ch_gfa.mix(unicycler.out.gfa)
+    // no ch_summary
+    ch_consensus = ch_consensus.mix(unicycler.out.fasta)
+    ch_versions = ch_versions.mix(unicycler.out.versions.first())
+  }
+
+  if (params.assembler.replaceAll('dragonflye','dragon') =~ /flye/ || params.assembler =~ /raven/ ) {
+    // quality filter
+    ch_illumina_input
+      .map { it -> [it[0], it[1], "illumina"]}
+      .mix(ch_nanopore_input.map { it -> [it[0], it[1], "nanopore"]})
+      .filter{it[0]}
+      .set { ch_input }
+
+    fastp(ch_input)
+
+    ch_versions = ch_versions.mix(fastp.out.versions)
+    ch_summary = ch_summary.mix(fastp.out.summary)
+
+    fastp.out.fastq
+      .filter { it[1].size() > 200 }
+      .branch { it ->
+        nanopore: it[2] == 'nanopore'
+        illumina: it[2] == 'illumina'
+      }
+      .set { ch_filter }
+
+    rasusa(ch_filter.nanopore.map {it -> tuple(it[0], it[1])})
+
+    ch_versions = ch_versions.mix(rasusa.out.versions)
+
+    if (params.assembler =~ /raven/ ) {
+      raven(rasusa.out.fastq)
+
+      ch_gfa = ch_gfa.mix(raven.out.gfa)
+      // no ch_summary
+      // no ch_consensus
+      ch_versions = ch_versions.mix(raven.out.versions.first())
+    }
+
if (params.assembler =~ /flye/ ) { + flye(rasusa.out.fastq) + + flye.out.summary + .collectFile( + storeDir: "${params.outdir}/summary/", + keepHeader: true, + sort: { file -> file.text }, + name: "flye_summary.tsv") + .set { flye_summary } + + ch_gfa = ch_gfa.mix(flye.out.gfa) + ch_summary = ch_summary.mix(flye_summary) + // no ch_consensus + ch_versions = ch_versions.mix(flye.out.versions.first()) + } + } + + bandage(ch_gfa) + gfastats(ch_gfa) + + gfastats.out.summary + .collectFile( + storeDir: "${params.outdir}/summary/", + keepHeader: true, + sort: { file -> file.text }, + name: "gfastats_summary.csv") + .set { gfastats_summary } + + ch_versions = ch_versions.mix(bandage.out.versions).mix(gfastats.out.versions) + ch_summary = ch_summary.mix(gfastats_summary).mix(bandage.out.png) + + if (params.assembler.replaceAll('dragonflye','dragon') =~ /flye/ || params.assembler =~ /raven/ ) { + gfa_to_fasta(gfastats.out.stats.filter { it -> !(it[1] =~ /unicycler/ )} ) + + dnaapler(gfa_to_fasta.out.fasta) + + ch_versions = ch_versions.mix(dnaapler.out.versions) + + dnaapler.out.fasta + .branch { + dragonflye: it =~ /dragonflye/ + raven: it =~ /raven/ + flye: it =~ /flye/ + } + .set { ch_dnaapler_out } + + ch_dnaapler_out.flye + .join(ch_nanopore_input, by:0, remainder: false) + .mix(ch_dnaapler_out.raven.join(ch_nanopore_input, by:0, remainder: false)) + .set { ch_reoriented } + + medaka(ch_reoriented) + + ch_versions = ch_versions.mix(medaka.out.versions) + + medaka.out.fasta + .branch { + dragonflye: it =~ /dragonflye/ + raven: it =~ /raven/ + flye: it =~ /flye/ + } + .set { ch_medaka_out } + + ch_medaka_out.flye + .join(ch_illumina_input, by:0, remainder: false) + .mix(ch_medaka_out.raven.join(ch_illumina_input, by:0, remainder: false)) + .set { ch_medaka_polished } + + bwa(ch_medaka_polished) + polypolish(bwa.out.sam) + + ch_versions = ch_versions.mix(bwa.out.versions).mix(polypolish.out.versions) + + polypolish.out.fasta + .branch { + dragonflye: it =~ /dragonflye/ + raven: it =~ /raven/ + flye: it =~ /flye/ + } + .set { ch_polypolish_out } + + ch_polypolish_out.flye + .join(ch_illumina_input, by:0, remainder: false) + .mix(ch_polypolish_out.raven.join(ch_illumina_input, by:0, remainder: false)) + .set { ch_polypolish_polished } + + pypolca(ch_polypolish_polished) + + pypolca.out.summary + .collectFile(name: "pypolca_summary.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/summary") + .set { pypolca_summary } + + ch_summary = ch_summary.mix(pypolca_summary) + ch_versions = ch_versions.mix(pypolca.out.versions) + + ch_consensus = ch_consensus.mix(dnaapler.out.fasta).mix(medaka.out.fasta).mix(polypolish.out.fasta).mix(pypolca.out.fasta) + } + + nanoplot(ch_nanopore_input) + + nanoplot.out.summary + .collectFile(name: "nanoplot_summary.csv", + keepHeader: true, + storeDir: "${params.outdir}/summary") + .set { nanostats_summary } + + ch_summary = ch_summary.mix(nanostats_summary).mix(nanoplot.out.stats) + ch_versions = ch_versions.mix(nanoplot.out.versions) + + busco(ch_consensus) + + ch_summary = ch_summary.mix(busco.out.summary) + ch_versions = ch_versions.mix(busco.out.versions) + + ch_consensus + .filter{ it -> !(it[1] =~ /pypolca/ )} + .filter{ it -> !(it[1] =~ /medaka/ )} + .filter{ it -> !(it[1] =~ /poylpolish/ )} + .branch { + dragonflye: it =~ /dragonflye/ + raven: it =~ /raven/ + flye: it =~ /flye/ + unicycler: it =~ /unicycler/ + } + .set { ch_assemblies } + + ch_assemblies.dragonflye + .join(ch_nanopore_input, by: 0 , remainder: 
false).join(ch_illumina_input, by: 0, remainder: true)
+    .mix(ch_assemblies.flye.join(ch_nanopore_input, by: 0 , remainder: false).join(ch_illumina_input, by: 0, remainder: true))
+    .mix(ch_assemblies.unicycler.join(ch_nanopore_input, by: 0 , remainder: false).join(ch_illumina_input, by: 0, remainder: true))
+    .mix(ch_assemblies.raven.join(ch_nanopore_input, by: 0 , remainder: false).join(ch_illumina_input, by: 0, remainder: true))
+    .filter{ it -> if (it) {it[1]}}
+    .set{ch_assembly_reads}
+
+  circulocov(ch_assembly_reads)
+
+  circulocov.out.summary
+    .collectFile(name: "circulocov_summary.txt",
+      keepHeader: true,
+      storeDir: "${params.outdir}/summary")
+    .set { circulocov_summary }
+
+  ch_versions = ch_versions.mix(circulocov.out.versions)
+  ch_summary = ch_summary.mix(circulocov.out.summary)
+
+  ch_versions
+    .collectFile(
+      keepHeader: false,
+      name: "versions.yml")
+    .set { ch_collated_versions }
+
+  versions(ch_collated_versions)
+  ch_summary = ch_summary.mix(versions.out.versions)
+
+  summary(ch_summary.unique().collect())
+
+  multiqc(ch_summary.unique().collect())
+
+  ch_consensus
+    .combine(circulocov_summary)
+    .combine(gfastats_summary)
+    .set { ch_fasta_info }
+
+  copy(ch_fasta_info)
+
+  emit:
+  fasta = ch_consensus
+}
+
+
+// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
+
+// Workflow
+
+// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
+
+workflow {
+
+  if (params.test) {
+
+    test()
+
+    test.out.fastq
+      .map { it ->
+        meta = [id:it[0]]
+        tuple( meta,
+          file("${it[1]}", checkIfExists: true),
+          [file("${it[2][0]}", checkIfExists: true), file("${it[2][1]}", checkIfExists: true)])
+      }
+      .set{ ch_test_out }
+
+    ch_test_out
+      .map{it -> tuple(it[0], it[1])}
+      .set { ch_test_nanopore }
+
+    ch_test_out
+      .filter{ it[2] }
+      .map{it -> tuple(it[0], it[2])}
+      .set { ch_test_illumina }
+
+    ch_nanopore_input = ch_nanopore_input.mix(ch_test_nanopore)
+    ch_illumina_input = ch_illumina_input.mix(ch_test_illumina)
+  }
+
+  if (params.sequencing_summary) {
+    nanoplot_summary(ch_sequencing_summary)
+  }
+
+  DONUT_FALLS(ch_nanopore_input, ch_illumina_input.ifEmpty([]))
+}
+
+workflow.onComplete {
+  println("Pipeline completed at: $workflow.complete")
+  println("The multiqc report can be found at ${params.outdir}/multiqc/multiqc_report.html")
+  println("The consensus fasta files can be found in ${params.outdir}/consensus")
+  println("The fasta files are from each phase of assembly. pypolca > polypolish > medaka > unpolished")
+  println("Execution status: ${ workflow.success ?
'OK' : 'failed' }") +} \ No newline at end of file diff --git a/main.nf b/main.nf deleted file mode 100644 index 40c024a..0000000 --- a/main.nf +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl = 2 - - -println("Currently using the Donut Falls workflow for use with nanopore sequencing\n") -println("Author: Erin Young") -println("email: eriny@utah.gov") -println("Version: ${workflow.manifest.version}") -println("") - -params.config_file = false -if (params.config_file) { - def src = new File("${workflow.projectDir}/configs/donut_falls_config_template.config") - def dst = new File("${workflow.launchDir}/edit_me.config") - dst << src.text - println("A config file can be found at ${workflow.launchDir}/edit_me.config") - exit 0 -} - -params.sequencing_summary = workflow.launchDir + "/*sequencing_summary*txt" -params.sample_sheet = '' -params.assembler = 'flye' -params.outdir = 'donut_falls' -params.remove = 'remove.txt' -params.reads = '' -params.test_wf = false - -params.bandage_options = '' -params.busco_options = '' -params.circlator_options = '' -params.dragonflye_options = '' -params.enable_porechop = false -params.fastp_options = '' -params.filtlong_options = '--min_length 1000 --keep_percent 95' -params.flye_options = '' -params.gfastats_options = '' -params.masurca_options = '' -params.medaka_options = '' -params.multiqc_options = '' -params.nanoplot_summary_options = '' -params.nanoplot_options = '' -params.polca_options = '' -params.polypolish_options = '' -params.porechop_options = '' -params.quast_options = '' -params.rasusa_options = '--frac 80' -params.raven_options = '--polishing-rounds 2' -params.trycycler_subsample_options = '' -params.trycycler_cluster_options = '' -params.trycycler_consensus_options = '' -params.trycycler_dotplot_options = '' -params.trycycler_min_fasta = 6 -params.trycycler_msa_options = '' -params.trycycler_partition_options = '' -params.trycycler_reconcile_options = '' -params.unicycler_options = '' - -include { assembly } from './workflows/assembly' addParams(params) -include { copy } from './modules/copy' addParams(params) -include { filter } from './workflows/filter' addParams(params) -include { hybrid } from './workflows/hybrid' addParams(params) -include { nanoplot_summary as nanoplot } from './modules/nanoplot' addParams(params) -include { metrics } from './workflows/metrics' addParams(params) -include { polish } from './workflows/polish' addParams(params) -include { test } from './workflows/test' addParams(params) -include { trycycler } from './workflows/trycycler' addParams(params) - -Channel - .fromPath(params.sequencing_summary, type:'file') - .view { "Summary File : $it" } - .set { ch_sequencing_summary } - -if ( params.test_wf ) { - println("Let's test this!\n") - ch_input_files = Channel.empty() -} else if ( params.sample_sheet) { - Channel - .fromPath("${params.sample_sheet}", type: "file") - .splitCsv( header: true, sep: ',' ) - .map { it -> tuple( "${it.sample}", "${it.fastq}", "${it.fastq_1}", "${it.fastq_2}" ) } - .branch { it -> - sr: it[2] != it[3] - other: true - } - .set{ch_precheck} - - ch_precheck.sr.map { it -> tuple(it[0], file(it[1]), [file(it[2]), file(it[3])])} - .mix(ch_precheck.other.map { it -> tuple(it[0], file(it[1]), null)}) - .set{ch_input_files} - -} else if ( params.reads ) { - Channel - .fromPath("${params.reads}/*.{fastq,fastq.gz,fq,fq.gz}", type:'file') - .ifEmpty { - println("Could not find fastq files for nanopore sequencing. 
Set with 'params.reads'") - exit 1 - } - .map { reads -> tuple(reads.simpleName, reads, null ) } - .view { "Fastq file found : ${it[0]}" } - .set { ch_input_files } -} else { - println("Thank you for using Donut Falls!\n") - println("Please set a sample sheet with params.sample_sheet or a directory with params.reads!") - exit 0 -} - -ch_remove = Channel.fromPath("${params.remove}", type: "file") - -workflow { - ch_illumina = Channel.empty() - ch_fastq = Channel.empty() - ch_fasta = Channel.empty() - ch_consensus = Channel.empty() - ch_summary = Channel.empty() - - - - if ( params.assembler == 'flye' || params.assembler == 'raven' || params.assembler == 'miniasm' || params.assembler == 'lr_unicycler' || params.assembler == 'dragonflye' ) { - if ( params.test_wf ) { - test() - ch_input_files = ch_input_files.mix(test.out.fastq) - } - - filter(ch_input_files) - assembly(filter.out.fastq) - - ch_fastq = ch_fasta.mix(filter.out.fastq) - ch_illumina = ch_illumina.mix(filter.out.reads) - ch_fasta = ch_fasta.mix(assembly.out.fasta) - ch_summary = ch_summary.mix(filter.out.summary).mix(assembly.out.summary) - - } else if ( params.assembler == 'unicycler' || params.assembler == 'masurca' ) { - hybrid(ch_input_files.filter{it -> it[2]}) - ch_consensus = ch_consensus.mix(hybrid.out.fasta) - ch_summary = ch_summary.mix(hybrid.out.summary) - - } else if ( params.assembler == 'trycycler' ) { - if ( params.test_wf ) { - test() - ch_input_files = ch_input_files.mix(test.out.fastq) - } - - filter(ch_input_files) - ch_fastq = ch_fasta.mix(filter.out.fastq) - ch_illumina = ch_illumina.mix(filter.out.reads) - ch_summary = ch_summary.mix(filter.out.summary) - - trycycler(filter.out.fastq, ch_remove.ifEmpty([])) - ch_fasta = ch_fasta.mix(trycycler.out.fasta) - ch_summary = ch_summary.mix(trycycler.out.summary) - } - - nanoplot(ch_sequencing_summary) - polish(ch_fastq, ch_fasta, ch_illumina.ifEmpty([])) - ch_consensus = ch_consensus.mix(polish.out.fasta) - - metrics( - ch_input_files.map{it -> tuple (it[0], it[1])}, - ch_consensus, - ch_summary.ifEmpty([])) - - copy(ch_consensus.map{it -> tuple(it[1])}.collect()) -} - -workflow.onComplete { - println("Pipeline completed at: $workflow.complete") - println("The multiqc report can be found at ${params.outdir}/multiqc/multiqc_report.html") - println("The consensus fasta files can be found in ${params.outdir}/consensus") - println("The fasta files are from each phase of assembly. polca > polypolish > medaka > unpolished") - println("Execution status: ${ workflow.success ? 
'OK' : 'failed' }") -} \ No newline at end of file diff --git a/modules/bandage.nf b/modules/bandage.nf deleted file mode 100644 index c827d56..0000000 --- a/modules/bandage.nf +++ /dev/null @@ -1,23 +0,0 @@ -process bandage { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 1 - container 'quay.io/biocontainers/bandage:0.8.1--hc9558a2_2' - - input: - tuple val(sample), file(gfa) - - output: - tuple val(sample), path("bandage/${sample}.{png,svg}"), emit: fastq - path "bandage/${sample}_mqc.png", emit: summary - - shell: - ''' - mkdir -p bandage - - Bandage image !{gfa} bandage/!{sample}.png !{params.bandage_options} - Bandage image !{gfa} bandage/!{sample}.svg !{params.bandage_options} - - cp bandage/!{sample}.png bandage/!{sample}_mqc.png - ''' -} \ No newline at end of file diff --git a/modules/bgzip.nf b/modules/bgzip.nf deleted file mode 100644 index 4887c4a..0000000 --- a/modules/bgzip.nf +++ /dev/null @@ -1,21 +0,0 @@ -process bgzip { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 1 - container 'staphb/htslib:1.17' - - input: - tuple val(sample), file(fastq) - - output: - tuple val(sample), path("filtlong/${fastq}.gz"), emit: fastq - - shell: - ''' - mkdir -p filtlong - bgzip --version - - bgzip -@ !{task.cpus} !{fastq} - mv !{fastq}.gz filtlong/. - ''' -} \ No newline at end of file diff --git a/modules/busco.nf b/modules/busco.nf deleted file mode 100644 index f08d8ac..0000000 --- a/modules/busco.nf +++ /dev/null @@ -1,26 +0,0 @@ -process busco { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 1 - container 'ezlabgva/busco:v5.4.5_cv1' - errorStrategy 'ignore' - - input: - tuple val(sample), file(fasta) - - output: - path("busco/${sample}/*") - path("busco/${sample}/short_summary*.txt"), optional: true, emit: summary - - shell: - ''' - busco --version - - busco !{params.busco_options} \ - -m genome \ - -i !{fasta} \ - -o busco/!{sample} \ - --cpu !{task.cpus} \ - --auto-lineage-prok - ''' -} \ No newline at end of file diff --git a/modules/bwa.nf b/modules/bwa.nf deleted file mode 100644 index a93e64f..0000000 --- a/modules/bwa.nf +++ /dev/null @@ -1,21 +0,0 @@ -process bwa { - tag "${sample}" - label "maxcpus" - container 'staphb/bwa:0.7.17' - cpus 6 - - input: - tuple val(sample), file(fasta), file(fastq) - - output: - tuple val(sample), file(fasta), file("bwa/${sample}_{1,2}.sam"), emit: sam - - shell: - ''' - mkdir -p bwa - - bwa index !{fasta} - bwa mem -t !{task.cpus} -a !{fasta} !{fastq[0]} > bwa/!{sample}_1.sam - bwa mem -t !{task.cpus} -a !{fasta} !{fastq[1]} > bwa/!{sample}_2.sam - ''' -} diff --git a/modules/circlator.nf b/modules/circlator.nf deleted file mode 100644 index 16666af..0000000 --- a/modules/circlator.nf +++ /dev/null @@ -1,40 +0,0 @@ -process circlator { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 1 - //stageInMode 'copy' - container 'staphb/circlator:1.5.5' - //errorStrategy 'ignore' - - input: - tuple val(sample), file(fasta) - - output: - tuple val(sample), file("circlator/${sample}_unpolished.fasta"), emit: fasta - path "circlator/${sample}*", emit: directory - path "circlator/${sample}_fixstart_summary.csv", emit: summary - - shell: - ''' - mkdir -p circlator - - circlator version - - nucmer --version - - touch test_circular.fasta - cat *circular.fasta > circular.fa - - circlator fixstart !{params.circlator_options} \ - circular.fa \ - circlator/!{sample}_fixstart - - cp circlator/!{sample}_fixstart.fasta circlator/!{sample}_unpolished.fasta - - touch 
test_open.fasta - cat *open.fasta >> circlator/!{sample}_unpolished.fasta - - head -n 1 circlator/!{sample}_fixstart.log | tr "\\t" "," | awk '{print "sample," $0 }' > circlator/!{sample}_fixstart_summary.csv - tail -n+2 circlator/!{sample}_fixstart.log | tr "\\t" "," | awk -v sample=!{sample} '{print sample "," $0 }' >> circlator/!{sample}_fixstart_summary.csv - ''' -} \ No newline at end of file diff --git a/modules/copy.nf b/modules/copy.nf deleted file mode 100644 index 9aced21..0000000 --- a/modules/copy.nf +++ /dev/null @@ -1,21 +0,0 @@ -process copy { - publishDir "${params.outdir}", mode: 'copy' - tag "putting all fasta files in ${params.outdir}/consensus" - container 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' - - input: - file(fasta) - - output: - path "consensus/" - - shell: - ''' - mkdir consensus - - for fasta in !{fasta} - do - cat $fasta | sed 's/_length/ length/g' | sed 's/_circular/ circular/g' | sed 's/_polypolish//g' > consensus/$fasta - done - ''' -} diff --git a/modules/download.nf b/modules/download.nf deleted file mode 100644 index 83ac510..0000000 --- a/modules/download.nf +++ /dev/null @@ -1,28 +0,0 @@ -process download { - tag "Downloading subset15000" - cpus 1 - container 'staphb/gfastats:1.3.6' - - output: - tuple val("subset15000"), file("nfcore_subset15000.fa.gz"), emit: fastq - - shell: - ''' - wget -q https://github.com/nf-core/test-datasets/blob/23f5b889e4736798c8692e9b92810d9a3e37ee97/nanopore/subset15000.fq.gz?raw=true -O nfcore_subset15000.fa.gz - ''' -} - -process great_dataset { - tag "Downloading the great dataset" - cpus 1 - container 'staphb/gfastats:1.3.6' - - output: - tuple val("great_dataset"), file("reads.fastq.gz"), emit: fastq - - shell: - ''' - wget -q https://bridges.monash.edu/ndownloader/files/23754659 -O great_dataset.tar.gz - tar -xvf great_dataset.tar.gz - ''' -} \ No newline at end of file diff --git a/modules/dragonflye.nf b/modules/dragonflye.nf deleted file mode 100644 index 8293880..0000000 --- a/modules/dragonflye.nf +++ /dev/null @@ -1,36 +0,0 @@ -process dragonflye { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 12 - container 'staphb/dragonflye:1.0.14' - - input: - tuple val(sample), file(fastq) - - output: - tuple val(sample), file("dragonflye/${sample}/${sample}_dragonflye.fasta"), optional: true, emit: fasta - tuple val(sample), file("dragonflye/${sample}/${sample}_dragonflye.gfa"), optional: true, emit: gfa - path "dragonflye/${sample}/${sample}_assembly_info.tsv", emit: summary - path "dragonflye/${sample}/*" - - shell: - ''' - mkdir -p dragonflye - - dragonflye --version - - dragonflye !{params.dragonflye_options} \ - --reads !{fastq} \ - --cpus !{task.cpus} \ - --outdir dragonflye/!{sample} \ - --prefix !{sample} - - # renaming final files - if [ -f "dragonflye/!{sample}/flye-unpolished.gfa" ] ; then cp dragonflye/!{sample}/flye-unpolished.gfa dragonflye/!{sample}/!{sample}_dragonflye.gfa ; fi - if [ -f "dragonflye/!{sample}/flye.fasta" ] ; then cp dragonflye/!{sample}/flye.fasta dragonflye/!{sample}/!{sample}_dragonflye.fasta ; fi - - # getting a summary file - head -n 1 dragonflye/!{sample}/flye-info.txt | awk '{print "sample\\t" $0}' > dragonflye/!{sample}/!{sample}_assembly_info.tsv - tail -n+2 dragonflye/!{sample}/flye-info.txt | awk -v sample=!{sample} '{print sample "\\t" $0}' >> dragonflye/!{sample}/!{sample}_assembly_info.tsv - ''' -} \ No newline at end of file diff --git a/modules/fastp.nf b/modules/fastp.nf deleted file mode 100644 index 0c3e777..0000000 --- a/modules/fastp.nf 
+++ /dev/null @@ -1,33 +0,0 @@ -process fastp { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 1 - container 'staphb/fastp:0.23.2' - - when: - sample != null - - input: - tuple val(sample), file(reads) - - output: - tuple val(sample), file("fastp/${sample}_fastp_{R1,R2}.fastq.gz"), emit: reads - path "fastp/*" - path "fastp/${sample}_fastp.json", emit: summary - - shell: - ''' - mkdir -p fastp - fastp --version - - fastp !{params.fastp_options} \ - --in1 !{reads[0]} \ - --in2 !{reads[1]} \ - --out1 fastp/!{sample}_fastp_R1.fastq.gz \ - --out2 fastp/!{sample}_fastp_R2.fastq.gz \ - --unpaired1 fastp/!{sample}_u.fastq.gz \ - --unpaired2 fastp/!{sample}_u.fastq.gz \ - -h fastp/!{sample}_fastp.html \ - -j fastp/!{sample}_fastp.json - ''' -} \ No newline at end of file diff --git a/modules/filtlong.nf b/modules/filtlong.nf deleted file mode 100644 index 278d017..0000000 --- a/modules/filtlong.nf +++ /dev/null @@ -1,36 +0,0 @@ -process filtlong { - tag "${sample}" - cpus 1 - container 'staphb/filtlong:0.2.1' - - input: - tuple val(sample), file(fastq), file(short_reads) - - output: - tuple val(sample), file("filtlong/${sample}_filtered.fastq"), optional: true, emit: fastq - - shell: - if (short_reads[1] == null) { - ''' - mkdir -p filtlong - - filtlong --version - - filtlong !{params.filtlong_options} \ - !{fastq} \ - > filtlong/!{sample}_filtered.fastq - ''' - } else { - ''' - mkdir -p filtlong - - filtlong --version - - filtlong !{params.filtlong_options} \ - -1 !{short_reads[0]} \ - -2 !{short_reads[1]} \ - !{fastq} \ - > filtlong/!{sample}_filtered.fastq - ''' - } -} \ No newline at end of file diff --git a/modules/flye.nf b/modules/flye.nf deleted file mode 100644 index c013cc6..0000000 --- a/modules/flye.nf +++ /dev/null @@ -1,36 +0,0 @@ -process flye { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 12 - container 'staphb/flye:2.9.2' - errorStrategy 'ignore' - - input: - tuple val(sample), file(fastq) - - output: - tuple val(sample), file("flye/${sample}/${sample}_flye.fasta"), optional: true, emit: fasta - tuple val(sample), file("flye/${sample}/${sample}_flye.gfa"), optional: true, emit: gfa - path "flye/${sample}/${sample}_assembly_info.tsv", emit: summary - path "flye/${sample}/*" - - shell: - ''' - mkdir -p flye/!{sample} - - flye --version - - flye !{params.flye_options} \ - --nano-raw !{fastq} \ - --threads !{task.cpus} \ - --out-dir flye/!{sample} - - # renaming final files - if [ -f "flye/!{sample}/assembly.fasta" ] ; then cp flye/!{sample}/assembly.fasta flye/!{sample}/!{sample}_flye.fasta ; fi - if [ -f "flye/!{sample}/assembly_graph.gfa" ] ; then cp flye/!{sample}/assembly_graph.gfa flye/!{sample}/!{sample}_flye.gfa ; fi - - # getting a summary file - head -n 1 flye/!{sample}/assembly_info.txt | awk '{print "sample\\t" $0}' > flye/!{sample}/!{sample}_assembly_info.tsv - tail -n+2 flye/!{sample}/assembly_info.txt | awk -v sample=!{sample} '{print sample "\\t" $0}' >> flye/!{sample}/!{sample}_assembly_info.tsv - ''' -} \ No newline at end of file diff --git a/modules/gfastats.nf b/modules/gfastats.nf deleted file mode 100644 index 89d2d6d..0000000 --- a/modules/gfastats.nf +++ /dev/null @@ -1,52 +0,0 @@ -process gfastats { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 1 - container 'staphb/gfastats:1.3.6' - - input: - tuple val(sample), file(gfa) - - output: - tuple val(sample), file("gfastats/${sample}_*_{circular,open}.fasta"), optional: true, emit: fasta - tuple val(sample), 
file("gfastats/${sample}.fasta"), optional: true, emit: assembly - path "gfastats/*" - path "gfastats/${sample}_gfastats_summary.csv", emit: summary - - shell: - ''' - mkdir -p gfastats - gfastats --version - - gfastats \ - !{gfa} \ - !{params.gfastats_options} \ - --threads !{task.cpus} \ - --tabular \ - --seq-report > gfastats/!{sample}_gfastats.txt - - while read line - do - header=$(echo $line | cut -f 2 -d ',') - length=$(echo $line | cut -f 4 -d ',') - circ=$(echo $line | cut -f 11 -d ',') - - if [ "$length" -ge 200 ] - then - if [[ "$circ" == "Y" ]] - then - echo ">$header length=$length circular=true" > gfastats/!{sample}_${header}_circular.fasta - grep -w "^S" !{gfa} | grep -w $header | awk '{print $3}' >> gfastats/!{sample}_${header}_circular.fasta - else - echo ">$header length=$length circular=false" > gfastats/!{sample}_${header}_open.fasta - grep -w "^S" !{gfa} | awk -v header=$header '{if ($2 == header) print $0}' | awk '{print $3}' >> gfastats/!{sample}_${header}_open.fasta - fi - fi - echo ">!{sample}_${header} length=$length circular=$circ" >> gfastats/!{sample}.fasta - grep -w "^S" !{gfa} | grep -w $header | awk '{print $3}' >> gfastats/!{sample}.fasta - done < <(grep -v Header gfastats/!{sample}_gfastats.txt | tr "\\t" ",") - - head -n 1 gfastats/!{sample}_gfastats.txt | tr "\\t" "," | awk '{print "sample," $0 "circular" }' > gfastats/!{sample}_gfastats_summary.csv - tail -n+2 gfastats/!{sample}_gfastats.txt | tr "\\t" "," | awk -v sample=!{sample} '{print sample "," $0 }' >> gfastats/!{sample}_gfastats_summary.csv - ''' -} diff --git a/modules/masurca.nf b/modules/masurca.nf deleted file mode 100644 index d04292d..0000000 --- a/modules/masurca.nf +++ /dev/null @@ -1,66 +0,0 @@ -process masurca { - publishDir "${params.outdir}", mode: 'copy' - stageInMode 'copy' - - tag "${sample}" - cpus 12 - container 'staphb/masurca:4.1.0' - - input: - tuple val(sample), file(nanopore), file(fastq) - - output: - path "masurca/${sample}" - tuple val(sample), file("masurca/${sample}/${sample}_primary.genome.scf.fasta"), emit: fasta - - shell: - ''' - mkdir -p masurca/!{sample} - - masurca --version - - masurca !{params.masurca_options} \ - -t !{task.cpus} \ - -i !{fastq[0]},!{fastq[1]} \ - -r !{nanopore} - - for dir in ls -d CA* - do - mv $dir masurca/!{sample}/. - done - - cp $(ls masurca/!{sample}/CA*/primary.genome.scf.fasta) masurca/!{sample}/!{sample}_primary.genome.scf.fasta - ''' -} - -process polca { - tag "${sample}" - cpus 6 - container 'staphb/masurca:latest' - publishDir "${params.outdir}", mode: 'copy' - - input: - tuple val(sample), file(fasta), file(fastq) - - output: - tuple val(sample), file("polca/${sample}/${sample}_polca_polished.fa"), optional: true, emit: fasta - path "polca/${sample}/*", emit: directory - - shell: - ''' - mkdir -p polca/!{sample} - - masurca --version - - cp !{fasta} !{sample}.fasta - - polca.sh !{params.polca_options} \ - -r '!{fastq}' \ - -a !{sample}.fasta \ - -t !{task.cpus} - - mv !{sample}.fasta* polca/!{sample}/. 
- - if [ -f "polca/!{sample}/!{sample}.fasta.PolcaCorrected.fa" ] ; then cp polca/!{sample}/!{sample}.fasta.PolcaCorrected.fa polca/!{sample}/!{sample}_polca_polished.fa ; fi - ''' -} \ No newline at end of file diff --git a/modules/medaka.nf b/modules/medaka.nf deleted file mode 100644 index 903a247..0000000 --- a/modules/medaka.nf +++ /dev/null @@ -1,30 +0,0 @@ -process medaka { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 6 - container 'ontresearch/medaka:v1.7.3' - - input: - tuple val(sample), path(fasta), path(fastq) - - output: - path "medaka/${sample}/", emit: directory - tuple val(sample), path("medaka/${sample}/${sample}_medaka_consensus.fasta"), emit: fasta - - shell: - ''' - mkdir -p medaka - - medaka --version - - cat !{fasta} | sed 's/ /_/g' > !{fasta}.fasta - - medaka_consensus !{params.medaka_options} \ - -i !{fastq} \ - -d !{fasta}.fasta \ - -o medaka/!{sample} \ - -t !{task.cpus} - - cp medaka/!{sample}/consensus.fasta medaka/!{sample}/!{sample}_medaka_consensus.fasta - ''' -} \ No newline at end of file diff --git a/modules/miniasm.nf b/modules/miniasm.nf deleted file mode 100644 index 9419b83..0000000 --- a/modules/miniasm.nf +++ /dev/null @@ -1,28 +0,0 @@ - process miniasm { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 12 - container 'staphb/minipolish:0.1.3' - - input: - tuple val(sample), file(fastq) - - output: - tuple val(sample), path("miniasm/${sample}/${sample}_miniasm.gfa"), emit: gfa - path "miniasm/${sample}/*", emit: directory - - shell: - ''' - mkdir -p miniasm/!{sample} - - echo "miniasm version : $(miniasm -V)" - minimap2 --version - minipolish --version - - miniasm_and_minipolish.sh \ - !{fastq} \ - !{task.cpus} \ - > miniasm/!{sample}/!{sample}_miniasm.gfa - ''' -} - diff --git a/modules/multiqc.nf b/modules/multiqc.nf deleted file mode 100644 index cc2f0d4..0000000 --- a/modules/multiqc.nf +++ /dev/null @@ -1,35 +0,0 @@ -process multiqc { - publishDir "${params.outdir}", mode: 'copy' - tag "multiqc" - container 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' - - - //fastp - //filtlong - //porechop - //quast - //busco - - input: - file(input) - - output: - path "multiqc/multiqc_report.html" - path "multiqc/multiqc_data/*" - - shell: - ''' - mkdir -p multiqc - multiqc --version - - if [ -f "circlator_summary.csv" ] ; then mv circlator_summary.csv circlator_summary_mqc.csv ; fi - if [ -f "flye_summary.tsv" ] ; then mv flye_summary.tsv flye_summary_mqc.tsv ; fi - if [ -f "dragonflye_summary.tsv" ] ; then mv dragonflye_summary.tsv dragonflye_summary_mqc.tsv ; fi - if [ -f "gfastats_summary.csv" ] ; then mv gfastats_summary.csv gfastats_summary_mqc.csv ; fi - if [ -f "NanoStats.csv" ] ; then mv NanoStats.csv NanoStats_mqc.csv ; fi - - multiqc !{params.multiqc_options} \ - --outdir multiqc \ - . 
- ''' -} diff --git a/modules/nanoplot.nf b/modules/nanoplot.nf deleted file mode 100644 index 283093b..0000000 --- a/modules/nanoplot.nf +++ /dev/null @@ -1,56 +0,0 @@ -process nanoplot_summary { - publishDir "${params.outdir}", mode: 'copy' - tag "${sequencing_summary}" - cpus 6 - container 'staphb/nanoplot:1.40.0' - - input: - file(sequencing_summary) - - output: - path "nanoplot/summary", emit: final_directory - - shell: - ''' - mkdir -p nanoplot/summary - NanoPlot --version - - NanoPlot !{params.nanoplot_summary_options} \ - --summary !{sequencing_summary} \ - --threads !{task.cpus} \ - --outdir nanoplot/summary \ - --tsv_stats - ''' -} - -process nanoplot { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 6 - container 'staphb/nanoplot:1.40.0' - - input: - tuple val(sample), file(fastq) - - output: - path "nanoplot/${sample}", emit: final_directory - path "nanoplot/${sample}/${sample}_NanoStats.csv", emit: summary - - shell: - ''' - mkdir -p nanoplot/!{sample} - - NanoPlot --version - - NanoPlot !{params.nanoplot_options} \ - --fastq !{fastq} \ - --threads !{task.cpus} \ - --tsv_stats \ - --outdir nanoplot/!{sample} - - cp nanoplot/!{sample}/NanoStats.txt nanoplot/!{sample}/!{sample}_NanoStats.txt - - echo "sample,$(cut -f 1 nanoplot/!{sample}/!{sample}_NanoStats.txt | tr '\\n' ',' )" > nanoplot/!{sample}/!{sample}_NanoStats.csv - echo "!{sample},$(cut -f 2 nanoplot/!{sample}/!{sample}_NanoStats.txt | tr '\\n' ',' )" >> nanoplot/!{sample}/!{sample}_NanoStats.csv - ''' -} diff --git a/modules/polypolish.nf b/modules/polypolish.nf deleted file mode 100644 index 9d468f3..0000000 --- a/modules/polypolish.nf +++ /dev/null @@ -1,26 +0,0 @@ -process polypolish { - tag "${sample}" - cpus 6 - container 'staphb/polypolish:0.5.0' - publishDir "${params.outdir}", mode: 'copy' - - input: - tuple val(sample), file(fasta), file(sam) - - output: - tuple val(sample), file("polypolish/${sample}_polypolish.fasta"), emit: fasta - - shell: - ''' - mkdir -p polypolish - - polypolish_insert_filter.py --in1 !{sam[0]} --in2 !{sam[1]} --out1 !{sample}_filtered_1.sam --out2 !{sample}_filtered_2.sam - - polypolish !{params.polypolish_options} \ - !{fasta} \ - !{sample}_filtered_1.sam \ - !{sample}_filtered_2.sam > polypolish/!{sample}_polypolish.fasta - ''' -} - -// in polypolish container python3 -m pip install edlib mappy \ No newline at end of file diff --git a/modules/porechop.nf b/modules/porechop.nf deleted file mode 100644 index f960968..0000000 --- a/modules/porechop.nf +++ /dev/null @@ -1,24 +0,0 @@ -process porechop { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 6 - container 'staphb/porechop:0.2.4' - - input: - tuple val(sample), file(fastq) - - output: - tuple val(sample), file("porechop/${sample}_chopped.fastq.gz"), emit: fastq - - shell: - ''' - mkdir -p porechop - - porechop --version - - porechop !{params.porechop_options} \ - --threads !{task.cpus} \ - -i !{fastq} \ - -o porechop/!{sample}_chopped.fastq.gz - ''' -} \ No newline at end of file diff --git a/modules/rasusa.nf b/modules/rasusa.nf deleted file mode 100644 index 8c9d9a2..0000000 --- a/modules/rasusa.nf +++ /dev/null @@ -1,28 +0,0 @@ -process rasusa { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 6 - container 'staphb/rasusa:0.7.0' - - input: - tuple val(sample), file(fastq) - - output: - tuple val(sample), file("rasusa/${sample}/*.fastq"), emit: fastq - - shell: - ''' - mkdir -p rasusa/!{sample} - - rasusa --version - - for i in 01 02 03 04 05 06 07 08 09 10 11 
12
-  do
-    rasusa !{params.rasusa_options} \
-      -i !{fastq} \
-      --output rasusa/!{sample}/sample_$i.fastq
-  done
-
-
-  '''
-}
\ No newline at end of file
diff --git a/modules/raven.nf
deleted file mode 100644
index 4a0da3e..0000000
--- a/modules/raven.nf
+++ /dev/null
@@ -1,28 +0,0 @@
-process raven {
-  publishDir "${params.outdir}", mode: 'copy'
-  tag "${sample}"
-  cpus 12
-  container 'staphb/raven:1.8.1'
-  errorStrategy 'ignore'
-
-  input:
-  tuple val(sample), file(fastq)
-
-  output:
-  tuple val(sample), file("raven/${sample}/${sample}_raven.fasta"), emit: fasta
-  tuple val(sample), file("raven/${sample}/${sample}_raven.gfa"), emit: gfa
-  path("raven/${sample}/*"), emit: directory
-
-  shell:
-  '''
-  mkdir -p raven/!{sample}
-
-  raven --version
-
-  raven !{params.raven_options} \
-    --threads !{task.cpus} \
-    --graphical-fragment-assembly raven/!{sample}/!{sample}_raven.gfa \
-    !{fastq} \
-    > raven/!{sample}/!{sample}_raven.fasta
-  '''
-}
\ No newline at end of file
diff --git a/modules/summary.nf
deleted file mode 100644
index 14d214f..0000000
--- a/modules/summary.nf
+++ /dev/null
@@ -1,47 +0,0 @@
-process summary {
-  tag "${sample}"
-  cpus 1
-  publishDir "${params.outdir}", mode: 'copy'
-  container 'staphb/gfastats:1.3.6'
-
-  input:
-  tuple val(sample),
-    val(num_contigs),
-    val(num_closed_contigs),
-    val(chr_closed),
-    val(chr_cov),
-    val(genome_size),
-    val(nanoplot_mean_read_length),
-    val(nanoplot_mean_read_quality),
-    val(nanoplot_median_read_length),
-    val(nanoplot_median_read_quality),
-    val(nanoplot_number_of_reads),
-    val(nanoplot_read_length_N),
-    val(nanoplot_total_bases)
-
-  output:
-  path "summary/${sample}.summary.tsv", emit: summary
-  path "logs/summary/${sample}.${workflow.sessionId}.{log,err}", emit: logs
-
-  shell:
-  '''
-  mkdir -p logs/summary summary
-  log_file=logs/summary/!{sample}.!{workflow.sessionId}.log
-  err_file=logs/summary/!{sample}.!{workflow.sessionId}.err
-
-  # time stamp + capturing tool versions
-  date | tee -a $log_file $err_file > /dev/null
-
-  header="sample\tnumber_of_contigs\tclosed_contigs\tchr_cov\tchr_closed\tgenome_size"
-  result="!{sample}\t!{num_contigs}\t!{num_closed_contigs}\t!{chr_cov}\t!{chr_closed}\t!{genome_size}"
-
-  if [ "!{params.nanoplot}" != "false" ]
-  then
-    header="$header\tmean_read_length\tmean_read_quality\tmedian_read_length\tmedian_read_quality\tnumber_of_reads\tread_length_N50\ttotal_bases"
-    result="$result\t!{nanoplot_mean_read_length}\t!{nanoplot_mean_read_quality}\t!{nanoplot_median_read_length}\t!{nanoplot_median_read_quality}\t!{nanoplot_number_of_reads}\t!{nanoplot_read_length_N}\t!{nanoplot_total_bases}"
-  fi
-
-  echo -e "$header" > summary/!{sample}.summary.tsv
-  echo -e "$result" >> summary/!{sample}.summary.tsv
-  '''
-}
\ No newline at end of file
diff --git a/modules/trycycler.nf
deleted file mode 100644
index b083734..0000000
--- a/modules/trycycler.nf
+++ /dev/null
@@ -1,212 +0,0 @@
-process subsample {
-  publishDir "${params.outdir}", mode: 'copy'
-  tag "${sample}"
-  cpus 12
-  container 'staphb/trycycler:0.5.4'
-
-  input:
-  tuple val(sample), file(fastq)
-
-  output:
-  tuple val(sample), file("trycycler/subsample/${sample}/*fastq"), optional: true, emit: fastq
-  tuple val(sample), file("trycycler/notsubsample/${sample}/*fastq.gz"), optional: true, emit: full
-
-  shell:
-  '''
-  mkdir -p trycycler/subsample/!{sample} trycycler/notsubsample/!{sample}
-
-  trycycler --version
-
-  trycycler subsample !{params.trycycler_subsample_options} \
- --reads !{fastq} \ - --threads !{task.cpus} \ - --out_dir trycycler/subsample/!{sample} || \ - cp !{fastq} trycycler/notsubsample/!{sample}/. - ''' -} - -process cluster { - publishDir "${params.outdir}/trycycler", mode: 'copy' - tag "${sample}" - cpus 12 - container 'staphb/trycycler:0.5.4' - - input: - tuple val(sample), file(fasta), file(fastq) - - output: - path "trycycler/${sample}/contigs.{newick,phylip}" - path "trycycler/${sample}/cluster*/*contigs/*.fasta" - tuple val(sample), path("trycycler/${sample}/cluster*"), emit: cluster - - shell: - ''' - mkdir -p trycycler/!{sample} - trycycler --version - - trycycler cluster !{params.trycycler_cluster_options} \ - --threads !{task.cpus} \ - --assemblies !{fasta} \ - --reads !{fastq} \ - --out_dir trycycler/!{sample} - ''' -} - -process dotplot { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}_${cluster}" - cpus 1 - container 'staphb/trycycler:0.5.4' - - input: - tuple val(sample), path(cluster) - - output: - path("trycycler/dotplot/${sample}_${cluster}_dotplots.png") - - shell: - ''' - mkdir -p trycycler/dotplot - - trycycler --version - - trycycler dotplot !{params.trycycler_dotplot_options} \ - --cluster_dir !{cluster} - - cp !{cluster}/dotplots.png trycycler/dotplot/!{sample}_!{cluster}_dotplots.png - ''' -} - -process reconcile { - tag "${sample}_${cluster}" - cpus 12 - //errorStrategy 'finish' - container 'staphb/trycycler:0.5.4' - - input: - tuple val(sample), path(cluster), file(fastq), file(remove) - - output: - tuple val(sample), path(cluster), optional:true, emit: cluster - - shell: - ''' - trycycler --version - - if [ -f "!{remove}" ] - then - while read line - do - cluster=$(echo $line | cut -f 2 -d ,) - file=$(echo $line | cut -f 3 -d ,) - if [ -f "$cluster/1_contigs/$file.fasta" ] ; then mv $cluster/1_contigs/$file.fasta $cluster/1_contigs/$file.fasta_remove ; fi - done < <(grep ^!{sample}, !{remove}) - fi - - num_fasta=$(ls !{cluster}/1_contigs/*.fasta | wc -l) - echo "There are $num_fasta in !{cluster} for !{sample}" - if [ "$num_fasta" -ge "!{params.trycycler_min_fasta}" ] - then - trycycler reconcile !{params.trycycler_reconcile_options} \ - --reads !{fastq} \ - --cluster_dir !{cluster} \ - --threads !{task.cpus} - - ls - - echo "There are $num_fasta in !{cluster} for !{sample}" - - ls !{cluster}/2_all_seqs.fasta - else - echo "!{sample} cluster !{cluster} only had $num_fasta fastas" - mv !{cluster} !{cluster}_cluster_too_small - fi - ''' -} - -process msa { - tag "${sample}_${cluster}" - cpus 12 - container 'staphb/trycycler:0.5.4' - - input: - tuple val(sample), path(cluster) - - output: - tuple val(sample), path(cluster), emit: cluster - - shell: - ''' - trycycler --version - - trycycler msa !{params.trycycler_msa_options} \ - --cluster_dir !{cluster} \ - --threads !{task.cpus} - ''' -} - -process partition { - tag "${sample}_${cluster}" - cpus 12 - container 'staphb/trycycler:0.5.4' - - input: - tuple val(sample), path(cluster), file(fastq) - - output: - tuple val(sample), path(cluster), emit: cluster - - shell: - ''' - trycycler --version - - trycycler partition !{params.trycycler_partition_options} \ - --reads !{fastq} \ - --cluster_dirs !{cluster} \ - --threads !{task.cpus} - ''' -} - -process consensus { - publishDir "${params.outdir}/trycycler/${sample}", mode: 'copy' - tag "${sample}_${cluster}" - cpus 12 - container 'staphb/trycycler:0.5.4' - - input: - tuple val(sample), path(cluster) - - output: - tuple val(sample), path("${cluster}/${sample}_*_trycycler_consensus.fasta"), emit: fasta - 
path "${cluster}" - - shell: - ''' - trycycler --version - - trycycler consensus !{params.trycycler_consensus_options} \ - --cluster_dir !{cluster} \ - --threads !{task.cpus} - - cp !{cluster}/7_final_consensus.fasta !{cluster}/!{sample}_!{cluster}_trycycler_consensus.fasta - ''' -} - -process combine { - publishDir "${params.outdir}", mode: 'copy' - tag "${sample}" - cpus 1 - container 'staphb/trycycler:0.5.4' - - input: - tuple val(sample), file(fasta) - - output: - tuple val(sample), path("trycycler/${sample}_trycycler_consensus.fasta"), emit: fasta - - shell: - ''' - mkdir -p trycycler - cat !{fasta} > trycycler/!{sample}_trycycler_consensus.fasta - ''' -} \ No newline at end of file diff --git a/modules/unicycler.nf b/modules/unicycler.nf deleted file mode 100644 index b3bc2e2..0000000 --- a/modules/unicycler.nf +++ /dev/null @@ -1,66 +0,0 @@ -process unicycler { - tag "${sample}" - cpus 12 - publishDir "${params.outdir}", mode: 'copy' - container 'staphb/unicycler:0.5.0' - //errorStrategy 'ignore' - - when: - illumina != null - - input: - tuple val(sample), file(nanopore), file(illumina) - - output: - path "unicycler/${sample}", emit: directory - tuple val(sample), file("unicycler/${sample}/${sample}.fasta"), emit: fasta - tuple val(sample), file("unicycler/${sample}/${sample}.gfa"), emit: gfa - - shell: - ''' - mkdir -p unicycler - - unicycler --version - - unicycler !{params.unicycler_options} \ - -1 !{illumina[0]} \ - -2 !{illumina[1]} \ - -l !{nanopore} \ - -o unicycler/!{sample} \ - -t !{task.cpus} - - if [ -f "unicycler/!{sample}/assembly.fasta" ] ; then cp unicycler/!{sample}/assembly.fasta unicycler/!{sample}/!{sample}.fasta ; fi - if [ -f "unicycler/!{sample}/assembly.gfa" ] ; then cp unicycler/!{sample}/assembly.gfa unicycler/!{sample}/!{sample}.gfa ; fi - ''' -} - -process unicycler_long { - tag "${sample}" - cpus 12 - publishDir "${params.outdir}", mode: 'copy' - container 'staphb/unicycler:0.5.0' - errorStrategy 'ignore' - - input: - tuple val(sample), file(nanopore) - - output: - path "unicycler/${sample}", emit: directory - tuple val(sample), file("unicycler/${sample}/${sample}.fasta"), emit: fasta - tuple val(sample), file("unicycler/${sample}/${sample}.gfa"), emit: gfa - - shell: - ''' - mkdir -p unicycler - - unicycler --version - - unicycler !{params.unicycler_options} \ - -l !{nanopore} \ - -o unicycler/!{sample} \ - -t !{task.cpus} - - if [ -f "unicycler/!{sample}/assembly.fasta" ] ; then cp unicycler/!{sample}/assembly.fasta unicycler/!{sample}/!{sample}.fasta ; fi - if [ -f "unicycler/!{sample}/assembly.gfa" ] ; then cp unicycler/!{sample}/assembly.gfa unicycler/!{sample}/!{sample}.gfa ; fi - ''' -} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 16eb199..0706ee1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,9 +1,9 @@ manifest { - mainScript = 'main.nf' + mainScript = 'donut_falls.nf' name = 'Donut Falls' author = 'Erin Young' homePage = 'https://github.com/UPHL-BioNGS/Donut_Falls' - version = '1.1.20230425' + version = '1.5.20240305' defaultBranch = 'main' } @@ -18,31 +18,41 @@ profiles { singularity.autoMounts = true } test { - params.test_wf = true + params.test = true } - test1 { - params.test_wf = true - params.enable_porechop = true - } - test2 { - params.test_wf = true - params.assembler = 'raven' - } - test3 { - params.test_wf = true - params.assembler = 'miniasm' - } - test4 { - params.test_wf = true - params.assembler = 'lr_unicycler' - } - test5 { - params.test_wf = true - params.assembler = 'trycycler' - 
params.trycycler_min_fasta = 9 - } - test6 { - params.test_wf = true - params.assembler = 'dragonflye' + campy { + includeConfig './configs/1_5.config' } } + +process { + maxRetries = 1 + maxErrors = '-1' + + withLabel:process_single { + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 10.m * task.attempt } + } + withLabel:process_low { + cpus = { 2 * task.attempt } + memory = { 12.GB * task.attempt } + time = { 2.h * task.attempt } + } + withLabel:process_medium { + cpus = { 6 * task.attempt } + memory = { 36.GB * task.attempt } + time = { 4.h * task.attempt } + } + withLabel:process_high { + cpus = { 12 * task.attempt } + memory = { 72.GB * task.attempt } + time = { 16.h * task.attempt } + } + withLabel:process_long { + time = { 20.h * task.attempt } + } + withLabel:process_high_memory { + memory = { 200.GB * task.attempt } + } +} diff --git a/workflows/assembly.nf b/workflows/assembly.nf deleted file mode 100644 index 6852dca..0000000 --- a/workflows/assembly.nf +++ /dev/null @@ -1,80 +0,0 @@ -include { bandage } from '../modules/bandage' addParams(params) -include { circlator } from '../modules/circlator' addParams(params) -include { dragonflye } from '../modules/dragonflye' addParams(params) -include { flye } from '../modules/flye' addParams(params) -include { gfastats } from '../modules/gfastats' addParams(params) -include { medaka } from '../modules/medaka' addParams(params) -include { miniasm } from '../modules/miniasm' addParams(params) -include { raven } from '../modules/raven' addParams(params) -include { unicycler_long as unicycler } from '../modules/unicycler' addParams(params) - -workflow assembly { - take: - ch_fastq - - main: - ch_gfa = Channel.empty() - ch_summary = Channel.empty() - - if (params.assembler == 'raven' ) { - raven(ch_fastq) - ch_gfa = ch_gfa.mix(raven.out.gfa) - } else if (params.assembler == 'flye' ) { - flye(ch_fastq) - ch_gfa = ch_gfa.mix(flye.out.gfa) - - flye.out.summary - .collectFile( - storeDir: "${params.outdir}/flye/", - keepHeader: true, - sort: { file -> file.text }, - name: "flye_summary.tsv") - .set { flye_summary } - - ch_summary=ch_summary.mix(flye_summary) - } else if (params.assembler == 'miniasm' ) { - miniasm(ch_fastq) - ch_gfa = ch_gfa.mix(miniasm.out.gfa) - } else if (params.assembler == 'lr_unicycler' ) { - unicycler(ch_fastq) - ch_gfa = ch_gfa.mix(unicycler.out.gfa) - } else if (params.assembler == 'dragonflye' ) { - dragonflye(ch_fastq) - ch_gfa = dragonflye.out.gfa - - dragonflye.out.summary - .collectFile( - storeDir: "${params.outdir}/dragonflye/", - keepHeader: true, - sort: { file -> file.text }, - name: "dragonflye_summary.tsv") - .set { dragonflye_summary } - - ch_summary=ch_summary.mix(dragonflye_summary) - } - - bandage(ch_gfa) - gfastats(ch_gfa) - circlator(gfastats.out.fasta) - - gfastats.out.summary - .collectFile( - storeDir: "${params.outdir}/gfastats/", - keepHeader: true, - sort: { file -> file.text }, - name: "gfastats_summary.csv") - .set { gfastats_summary } - - circlator.out.summary - .collectFile( - storeDir: "${params.outdir}/circlator/", - keepHeader: true, - sort: { file -> file.text }, - name: "circlator_summary.csv") - .set { circlator_summary } - - emit: - fasta = circlator.out.fasta - assembly = gfastats.out.assembly - summary = ch_summary.mix(bandage.out.summary).mix(gfastats_summary).mix(circlator_summary) -} \ No newline at end of file diff --git a/workflows/filter.nf b/workflows/filter.nf deleted file mode 100644 index adcdde9..0000000 --- a/workflows/filter.nf +++ /dev/null @@ -1,26 +0,0 @@ 
-include { bgzip } from '../modules/bgzip' addParams(params) -include { fastp } from '../modules/fastp' addParams(params) -include { filtlong } from '../modules/filtlong' addParams(params) -include { porechop } from '../modules/porechop' addParams(params) - -workflow filter { - take: - ch_input - - main: - fastp(ch_input.filter({ it[2] }).map { it -> tuple (it[0], it[2])}) - - if ( params.enable_porechop ) { - porechop(ch_input.map {it -> tuple (it[0], it[1])}) - filtlong(porechop.out.fastq.join(fastp.out.reads, by: 0, remainder: true)) - } else { - filtlong(ch_input.map {it -> tuple (it[0], it[1])}.join(fastp.out.reads, by: 0, remainder: true)) - } - - bgzip(filtlong.out.fastq) - - emit: - fastq = bgzip.out.fastq - reads = fastp.out.reads - summary = fastp.out.summary -} \ No newline at end of file diff --git a/workflows/hybrid.nf b/workflows/hybrid.nf deleted file mode 100644 index ac27619..0000000 --- a/workflows/hybrid.nf +++ /dev/null @@ -1,30 +0,0 @@ -include { bandage } from '../modules/bandage' addParams(params) -include { unicycler } from '../modules/unicycler' addParams(params) -include { masurca } from '../modules/masurca' addParams(params) - -workflow hybrid { - take: - ch_input - - main: - ch_consensus = Channel.empty() - ch_gfa = Channel.empty() - - if (params.assembler == "unicycler" ) { - unicycler(ch_input) - ch_gfa = ch_gfa.mix(unicycler.out.gfa) - ch_consensus = ch_consensus.mix(unicycler.out.fasta) - } else if (params.assembler == "masurca") { - // masurca(ch_input) - // ch_consensus = ch_consensus.mix(masurca.out.fasta) - println("We really wanted to support including of masurca, but it became too time consuming.") - println("If this assembler is useful to you, please submit an issue at https://github.com/UPHL-BioNGS/Donut_Falls/issues") - ch_consensus = Channel.empty() - } - - bandage(ch_gfa) - - emit: - fasta = ch_consensus - summary = bandage.out.summary -} \ No newline at end of file diff --git a/workflows/metrics.nf b/workflows/metrics.nf deleted file mode 100644 index 1a5c511..0000000 --- a/workflows/metrics.nf +++ /dev/null @@ -1,26 +0,0 @@ -include { busco } from '../modules/busco' addParams(params) -include { multiqc } from '../modules/multiqc' addParams(params) -include { nanoplot } from '../modules/nanoplot' addParams(params) -//include { quast } from '../modules/quast' addParams(params) - -workflow metrics { - take: - ch_reads - ch_consensus - ch_summary - - main: - nanoplot(ch_reads) - busco(ch_consensus) - - nanoplot.out.summary - .collectFile(name: "NanoStats.csv", - keepHeader: true, - storeDir: "${params.outdir}/nanoplot") - .set { nanostats_summary } - - - multiqc(ch_summary.mix(busco.out.summary).mix(nanostats_summary).collect()) - - -} \ No newline at end of file diff --git a/workflows/polish.nf b/workflows/polish.nf deleted file mode 100644 index 87aad63..0000000 --- a/workflows/polish.nf +++ /dev/null @@ -1,35 +0,0 @@ -include { bwa } from '../modules/bwa' addParams(params) -include { medaka } from '../modules/medaka' addParams(params) -include { polca } from '../modules/masurca' addParams(params) -include { polypolish } from '../modules/polypolish' addParams(params) -//include { pilon } from '../modules/pilon' addParams(params) - -workflow polish { - take: - ch_fastq - ch_fasta - ch_illumina - - main: - medaka(ch_fasta.join(ch_fastq, by:0)) - bwa(medaka.out.fasta.join(ch_illumina, by:0)) - polypolish(bwa.out.sam) - polca(polypolish.out.fasta.join(ch_illumina, by:0)) - - // mostly so that everything goes through busco - ch_medaka_polished = 
medaka.out.fasta.map{it -> tuple(it[0] + "_medaka" , it[1])} - ch_poly_polished = polypolish.out.fasta.map{it -> tuple(it[0] + "_polypolish", it[1])} - ch_polca_polished = polca.out.fasta.map{it -> tuple(it[0] + "_polca" , it[1])} - - ch_fasta - .mix(ch_medaka_polished) - .mix(ch_poly_polished) - .mix(ch_polca_polished) - .set{ch_consensus} - - // add a process to compress fasta files for download from nf-tower? - - emit: - fasta = ch_consensus - -} \ No newline at end of file diff --git a/workflows/test.nf b/workflows/test.nf deleted file mode 100644 index 2795ae0..0000000 --- a/workflows/test.nf +++ /dev/null @@ -1,18 +0,0 @@ -include { download } from '../modules/download' addParams(params) -include { great_dataset } from '../modules/download' addParams(params) - -workflow test { - input: - - main: - if ( params.assembler == 'trycycler' ) { - great_dataset() - fastq = great_dataset.out.fastq - } else { - download() - fastq = download.out.fastq - } - - emit: - fastq = fastq -} \ No newline at end of file diff --git a/workflows/trycycler.nf b/workflows/trycycler.nf deleted file mode 100644 index 6d3770d..0000000 --- a/workflows/trycycler.nf +++ /dev/null @@ -1,78 +0,0 @@ -include { assembly as flye_assembly } from './assembly' addParams(assembler: 'flye' ) -include { assembly as miniasm_assembly } from './assembly' addParams(assembler: 'miniasm' ) -include { assembly as raven_assembly } from './assembly' addParams(assembler: 'raven') -include { assembly as unicycler_assembly } from './assembly' addParams(assembler: 'lr_unicycler') -include { cluster; consensus; dotplot; msa; partition; reconcile; subsample } from '../modules/trycycler' addParams(params) -include { combine } from '../modules/trycycler' addParams(params) -include { rasusa } from '../modules/rasusa' addParams(params) - -workflow trycycler { - take: - ch_fastq - ch_remove - - main: - subsample(ch_fastq) - rasusa(subsample.out.full) - - subsample.out.fastq - .mix(rasusa.out.fastq) - .multiMap { it -> - flye: tuple (it[0], it[0] + '_flye', [it[1][1], it[1][5], it[1][9]]) - miniasm: tuple (it[0], it[0] + '_miniasm', [it[1][2], it[1][6], it[1][10]]) - raven: tuple (it[0], it[0] + '_raven', [it[1][3], it[1][7], it[1][11]]) - unicycler: tuple (it[0], it[0] + '_unicycler', [it[1][4], it[1][8], it[1][0]]) - } - .set { ch_subsampled } - - flye_assembly(ch_subsampled.flye.transpose().map { it -> tuple( it[1] + it[2].toString().replaceAll(~/.+sample/,"").replaceAll(~/.fastq/,""), it[2] )}) - miniasm_assembly(ch_subsampled.miniasm.transpose().map { it -> tuple( it[1] + it[2].toString().replaceAll(~/.+sample/,"").replaceAll(~/.fastq/,""), it[2] )}) - raven_assembly(ch_subsampled.raven.transpose().map { it -> tuple( it[1] + it[2].toString().replaceAll(~/.+sample/,"").replaceAll(~/.fastq/,""), it[2] )}) - unicycler_assembly(ch_subsampled.unicycler.transpose().map { it -> tuple( it[1] + it[2].toString().replaceAll(~/.+sample/,"").replaceAll(~/.fastq/,""), it[2] )}) - - flye_assembly.out.assembly - .mix(miniasm_assembly.out.assembly) - .mix(raven_assembly.out.assembly) - .mix(unicycler_assembly.out.assembly) - .map { it -> tuple( it[0].replaceAll(~/_flye.+/,"").replaceAll(~/_miniasm.+/,"").replaceAll(~/_raven.+/,"").replaceAll(~/_unicycler.+/,""), it[1])} - .groupTuple() - .join(ch_fastq, by:0) - .set { ch_assemblies } - - cluster(ch_assemblies) - //dotplot(cluster.out.cluster.transpose()) - reconcile(cluster.out.cluster.join(ch_fastq, by: 0).transpose().combine(ch_remove)) - msa(reconcile.out.cluster) - 
partition(msa.out.cluster.groupTuple().join(ch_fastq, by: 0).transpose()) - consensus(partition.out.cluster) - combine(consensus.out.fasta.groupTuple()) - - flye_assembly.out.summary - .mix(raven_assembly.out.summary) - .mix(unicycler_assembly.out.summary) - .mix(miniasm_assembly.out.summary) - .branch { it -> - gfastats: it =~ /gfastats/ - circlator: it =~ /circlator/ - other: true - } - .set { ch_for_summary } - - // ch_for_summary.gfastats - // .collectFile( - // storeDir: "${params.outdir}/gfastats/", - // keepHeader: true, - // name: "gfastats_summary.csv") - // .set { gfastats_summary } - - // ch_for_summary.circlator - // .collectFile( - // storeDir: "${params.outdir}/circlator/", - // keepHeader: true, - // name: "circlator_summary.csv") - // .set { circlator_summary } - - emit: - fasta = combine.out.fasta - summary = ch_for_summary.other -}
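
A usage sketch, not part of the diff: the new nextflow.config above moves resource handling from per-process withName blocks to the shared labels process_single, process_low, process_medium, process_high, process_long, and process_high_memory, most of which scale cpus, memory, and time with task.attempt so a retried task (maxRetries = 1 allows one retry) gets more resources. A site that needs different ceilings can override a label in its own config and pass it at runtime with -c. The file name site.config and the values below are illustrative assumptions, not files in the repository:

    // site.config (hypothetical): raise the label-based defaults from nextflow.config
    process {
        withLabel:process_high {
            cpus   = { 16 * task.attempt }
            memory = { 96.GB * task.attempt }
        }
        withLabel:process_high_memory {
            memory = { 250.GB * task.attempt }
        }
    }

Invoked with something like: nextflow run UPHL-BioNGS/Donut_Falls -profile docker -c site.config together with the usual input parameters.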