Merge pull request #99 from UPHL-BioNGS/update20230802
Update20230802
erinyoung committed Aug 4, 2023
2 parents cf9e235 + d8c43ec commit 01a39f2
Showing 26 changed files with 268 additions and 103 deletions.
11 changes: 9 additions & 2 deletions .github/workflows/ecoli.yml
@@ -38,5 +38,12 @@ jobs:
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
cat grandeur/shigatyper/shigatyper_results.txt
cat grandeur/serotypefinder/serotypefinder_results.txt
- name: Check E. coli file
run: |
for file in grandeur/shigatyper/shigatyper_results.txt grandeur/serotypefinder/serotypefinder_results.txt
do
head $file
wc -l $file
done
18 changes: 13 additions & 5 deletions .github/workflows/just_msa.yml
@@ -24,13 +24,21 @@ jobs:
run: |
docker --version
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/864/595/GCA_904864595.1_INF333/GCA_904864595.1_INF333_genomic.fna.gz && gzip -d GCA_904864595.1_INF333_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/783/245/GCA_013783245.1_ASM1378324v1/GCA_013783245.1_ASM1378324v1_genomic.fna.gz && gzip -d GCA_013783245.1_ASM1378324v1_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/026/626/185/GCA_026626185.1_ASM2662618v1/GCA_026626185.1_ASM2662618v1_genomic.fna.gz && gzip -d GCA_026626185.1_ASM2662618v1_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/808/985/GCA_020808985.1_ASM2080898v1/GCA_020808985.1_ASM2080898v1_genomic.fna.gz && gzip -d GCA_020808985.1_ASM2080898v1_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/863/225/GCA_904863225.1_KSB1_6J/GCA_904863225.1_KSB1_6J_genomic.fna.gz && gzip -d GCA_904863225.1_KSB1_6J_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/864/595/GCA_904864595.1_INF333/GCA_904864595.1_INF333_genomic.fna.gz && gzip -d GCA_904864595.1_INF333_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/783/245/GCA_013783245.1_ASM1378324v1/GCA_013783245.1_ASM1378324v1_genomic.fna.gz && gzip -d GCA_013783245.1_ASM1378324v1_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/026/626/185/GCA_026626185.1_ASM2662618v1/GCA_026626185.1_ASM2662618v1_genomic.fna.gz && gzip -d GCA_026626185.1_ASM2662618v1_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/808/985/GCA_020808985.1_ASM2080898v1/GCA_020808985.1_ASM2080898v1_genomic.fna.gz && gzip -d GCA_020808985.1_ASM2080898v1_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/863/225/GCA_904863225.1_KSB1_6J/GCA_904863225.1_KSB1_6J_genomic.fna.gz && gzip -d GCA_904863225.1_KSB1_6J_genomic.fna.gz
mkdir fastas
mv *fna fastas/.
nextflow run . -profile docker,just_msa --maxcpus 2 --medcpus 2
- name: Check MSA files
run: |
for file in grandeur/roary/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix_with_qc.txt
do
head $file
wc -l $file
done
9 changes: 8 additions & 1 deletion .github/workflows/legionella.yml
@@ -40,4 +40,11 @@ jobs:
cat grandeur/grandeur_summary.tsv
cat grandeur/legsta/legsta_summary.csv
- name: Check Legionella file
run: |
for file in grandeur/legsta/legsta_summary.csv
do
head $file
wc -l $file
done
9 changes: 8 additions & 1 deletion .github/workflows/run_workflow.yml
@@ -31,4 +31,11 @@ jobs:
mv *fastq.gz reads/.
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
- name: Check summary files
run: |
for file in grandeur/mlst/mlst_summary.tsv
do
head $file
wc -l $file
done
11 changes: 10 additions & 1 deletion .github/workflows/salmonella.yml
@@ -37,5 +37,14 @@ jobs:
done
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
cat grandeur/seqsero2/seqsero2_results.txt
- name: Check Salmonella file
run: |
for file in grandeur/seqsero2/seqsero2_results.txt
do
head $file
wc -l $file
done
9 changes: 8 additions & 1 deletion .github/workflows/strepA.yml
@@ -38,4 +38,11 @@ jobs:
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
cat grandeur/emmtyper/emmtyper_summary.tsv
- name: Check Strep A file
run: |
for file in grandeur/emmtyper/emmtyper_summary.tsv
do
head $file
wc -l $file
done
10 changes: 9 additions & 1 deletion .github/workflows/strep_pneumo.yml
@@ -38,4 +38,12 @@ jobs:
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
cat grandeur/pbptyper/pbptyper_summary.tsv
- name: Check Strep pneumo file
run: |
for file in grandeur/pbptyper/pbptyper_summary.tsv
do
head $file
wc -l $file
done
4 changes: 3 additions & 1 deletion .github/workflows/vibrio.yml
@@ -38,4 +38,6 @@ jobs:
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
grep -i vibrio grandeur/fastani/fastani_summary.csv
- name: Check Vibrio species
run: grep -i vibrio grandeur/fastani/fastani_summary.csv
48 changes: 48 additions & 0 deletions bin/datasets_download.py
@@ -0,0 +1,48 @@
#!/usr/bin/env python3

'''
Author: Erin Young
Description:
    This script gets genome accessions for a taxon from NCBI datasets.
EXAMPLE:
    python3 datasets_download.py taxon hits
'''

import subprocess
import sys

fields = 'accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len'

taxon = sys.argv[1]
genus, species = taxon.replace('[', '').replace(']', '').split('_')
print("Looking for accessions for " + genus + " " + species)
outfile = open('datasets/' + genus + '_' + species + '_genomes.csv', 'w')

# number of genomes to request; defaults to 5
try:
    hits = sys.argv[2]
except IndexError:
    hits = '5'

# putting in the header
outfile.write(fields + '\n')

# Getting representative genomes
# (no literal quotes around the taxon: subprocess passes each list item as one argument)
rep = subprocess.Popen(['datasets', 'summary', 'genome', 'taxon', genus + ' ' + species, '--reference', '--limit', hits, '--as-json-lines'], stdout=subprocess.PIPE)
dft = subprocess.check_output(['dataformat', 'tsv', 'genome', '--fields', fields], stdin=rep.stdout, text=True)
rep.wait()
for line in dft.split('\n'):
    # skip the header and blank lines; keep genomes under 15 Mb ungapped length
    if 'Ungapped Length' not in line and line:
        if int(line.split('\t')[4]) < 15000000:
            outfile.write(line.replace('\t', ',') + '\n')

# Getting additional genomes
oth = subprocess.Popen(['datasets', 'summary', 'genome', 'taxon', genus + ' ' + species, '--limit', hits, '--as-json-lines'], stdout=subprocess.PIPE)
df2 = subprocess.check_output(['dataformat', 'tsv', 'genome', '--fields', fields], stdin=oth.stdout, text=True)
oth.wait()
for line in df2.split('\n'):
    if 'Ungapped Length' not in line and line:
        if int(line.split('\t')[4]) < 15000000:
            outfile.write(line.replace('\t', ',') + '\n')

outfile.close()
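
For orientation, a minimal usage sketch (not part of the commit): it assumes the datasets and dataformat CLIs are on PATH and that the datasets/ output directory already exists; the taxon and hit count are illustrative.

mkdir -p datasets
# the taxon argument uses an underscore between genus and species, matching the pipeline convention
python3 bin/datasets_download.py Salmonella_enterica 5
head datasets/Salmonella_enterica_genomes.csv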
76 changes: 76 additions & 0 deletions bin/summary_file.py
@@ -0,0 +1,76 @@
#!/usr/bin/env python3

##########################################
# written by Erin Young                  #
# for creating summary files with the    #
# sample id for Grandeur                 #
##########################################

import os
import sys

out = sys.argv[2]
spl = sys.argv[4]

if not os.path.exists(sys.argv[1]):
    print("File " + sys.argv[1] + " does not exist. Exiting.")
    sys.exit()

# guess the delimiter from the first line
coms = 0
tabs = 0
with open(sys.argv[1]) as file:
    first_line = file.readline()
    coms = first_line.count(',')
    tabs = first_line.count('\t')

if tabs > coms:
    delim = '\t'
    print("Predicting tab delimited")
else:
    delim = ','
    print("Predicting comma delimited")

# debug output: echo each line and how it splits
with open(sys.argv[1], 'r') as file:
    lines = file.readlines()
    for line in lines:
        print(line)
        print(line.split(delim))

outfile = open(out, "w")

final_delim = ','
header = 'shouldntexist'  # sentinel that should never match a real line

# TODO: turn this into a dict
if sys.argv[3] == 'mlst':
    final_delim = '\t'
    header = 'PubMLST'
    outfile.write('sample\tfilename\tmatching PubMLST scheme\tST\tID1\tID2\tID3\tID4\tID5\tID6\tID7\tID8\tID9\tID10\tID11\tID12\tID13\tID14\tID15\n')
elif sys.argv[3] == 'shigatyper':
    final_delim = '\t'
    header = 'Number'
elif sys.argv[3] == 'kleborate':
    final_delim = '\t'
    header = 'largest_contig'
elif sys.argv[3] == 'plasmidfinder':
    final_delim = '\t'
    header = 'Accession number'
elif sys.argv[3] == 'emmtyper':
    outfile.write('sample\tIsolate name\tNumber of BLAST hits\tNumber of clusters\tPredicted emm-type\tPosition(s)\tPossible emm-like alleles\temm-like position(s)\tEMM cluster\n')
    final_delim = '\t'
    header = 'Number of BLAST hits'
elif sys.argv[3] == 'serotypefinder':
    final_delim = '\t'
    header = 'HSP length'

print("Using final delim " + final_delim + " with sample " + spl + " for " + sys.argv[3])

# prefix the header line with 'sample' and every other line with the sample id
with open(sys.argv[1]) as file:
    lines = file.readlines()
    for line in lines:
        if header in line:
            replace = line.replace(delim, final_delim)
            outfile.write('sample' + final_delim + replace)
        else:
            replace = line.replace(delim, final_delim)
            outfile.write(spl + final_delim + replace)

outfile.close()
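
A hedged usage sketch (file names and sample id are hypothetical): the four positional arguments are the raw tool output, the destination file, the tool name, and the sample id.

# normalize an mlst result to tabs and prefix every row with the sample id
python3 bin/summary_file.py sample01_mlst.tsv mlst/sample01_mlst_summary.tsv mlst sample01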
12 changes: 8 additions & 4 deletions grandeur.nf
@@ -146,8 +146,10 @@ include { test } from "./subworkflows/test"
// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

// Creating the summary files
summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file")
dataset_script = Channel.fromPath(workflow.projectDir + "/bin/datasets_download.py", type: "file")
snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file")
summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
summfle_script = Channel.fromPath(workflow.projectDir + "/bin/summary_file.py", type: "file")

// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

@@ -318,7 +320,8 @@ workflow {
ch_for_summary.collect(),
ch_contigs,
ch_fastani_genomes,
ch_genome_ref)
ch_genome_ref,
dataset_script)

ch_for_summary = ch_for_summary.mix(average_nucleotide_identity.out.for_summary)
ch_for_flag = ch_for_flag.mix(average_nucleotide_identity.out.for_flag)
@@ -341,7 +344,8 @@
ch_raw_reads,
ch_contigs,
ch_for_flag,
ch_size)
ch_size,
summfle_script)

ch_for_summary = ch_for_summary.mix(information.out.for_summary)
ch_for_multiqc = ch_for_multiqc.mix(information.out.for_multiqc)
2 changes: 1 addition & 1 deletion modules/blobtools.nf
@@ -125,4 +125,4 @@ process blobtools_plot {
grep -vw all blobtools/!{sample}_summary.txt > blobtools/!{sample}_blobtools.txt
'''
}
}
28 changes: 7 additions & 21 deletions modules/datasets.nf
@@ -1,7 +1,7 @@
process datasets_summary {
tag "${taxon}"
publishDir params.outdir, mode: 'copy'
container 'staphb/ncbi-datasets:15.2.0'
container 'quay.io/uphl/datasets:15.12.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -10,10 +10,10 @@ process datasets_summary {
//#UPHLICA time '10m'

input:
val(taxon)
tuple val(taxon), file(script)

output:
path "datasets/*_genomes.csv" , emit: genomes
path "datasets/*_genomes.csv" , emit: genomes, optional: true
path "logs/${task.process}/*.${workflow.sessionId}.log", emit: log

shell:
@@ -28,32 +28,18 @@ process datasets_summary {
echo "Nextflow command : " >> $log_file
cat .command.sh >> $log_file
taxon="$(echo !{taxon} | tr '_' ' ' | sed 's/\[//g' | sed 's/]//g' )"
echo "the taxon is now $taxon"
datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
grep -v Homo | \
tr '\\t' ',' \
> datasets/!{taxon}_genomes.csv
datasets summary genome taxon "$taxon" --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
grep -v Homo | \
grep -v "Assembly Accession" | \
tr '\\t' ',' \
>> datasets/!{taxon}_genomes.csv
python3 !{script} !{taxon} !{params.datasets_max_genomes}
'''
}

// It is faster if datasets can download the entire list at a time, but there is a timeout for downloading that is about 20 minutes.
// It is faster if datasets can download the entire list at a time, but there is a 20 minute timeout for downloading.
// The '||' falls back to downloading each genome on its own, which takes longer overall, but each individual download should finish in under 20 minutes.
process datasets_download {
tag "Downloading Genomes"
// because there's no way to specify threads
label "medcpus"
publishDir = [ path: "${params.outdir}", mode: 'copy', pattern: "{logs/*/*log,datasets/fastani_refs.tar.gz}" ]
container 'staphb/ncbi-datasets:15.2.0'
container 'quay.io/uphl/datasets:15.12.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -83,7 +69,7 @@
cat .command.sh >> $log_file
cut -f 1 !{genomes} > all_runs.txt
grep -h -v Accession !{ids} | cut -f 1 -d , | sort | uniq > this_run.txt
grep -h -v accession !{ids} | cut -f 1 -d , | sort | uniq > this_run.txt
cat all_runs.txt this_run.txt | sort | uniq > id_list.txt
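
The '||' fallback described in the comments above sits outside this hunk; a rough sketch of the pattern, assuming id_list.txt holds one accession per line (the exact flags used in the module may differ).

# try the whole list in one download; on timeout or failure, fall back to one genome at a time
datasets download genome accession --inputfile id_list.txt --filename datasets.zip || \
for accession in $(cat id_list.txt)
do
  datasets download genome accession $accession --filename ${accession}.zip
done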
12 changes: 6 additions & 6 deletions modules/emmtyper.nf
@@ -10,11 +10,11 @@ process emmtyper {
//#UPHLICA cpus 3
//#UPHLICA time '24h'

when:
flag =~ 'found'
when:
flag =~ 'found'

input:
tuple val(sample), file(contigs), val(flag)
tuple val(sample), file(contigs), val(flag), file(script)

output:
path "emmtyper/${sample}_emmtyper.txt" , emit: collect
@@ -34,12 +34,12 @@
echo "Nextflow command : " >> $log_file
cat .command.sh >> $log_file
echo -e "sample\\tIsolate name\\tNumber of BLAST hits\\tNumber of clusters\\tPredicted emm-type\\tPosition(s)\\tPossible emm-like alleles\\temm-like position(s)\\tEMM cluster" > emmtyper/!{sample}_emmtyper.txt
emmtyper !{params.emmtyper_options} \
--output-format 'verbose' \
!{contigs} \
| tee -a $log_file \
| awk -v sample=!{sample} '{ print sample "\\t" $0 }' >> emmtyper/!{sample}_emmtyper.txt
> !{sample}_emmtyper.txt
python3 !{script} !{sample}_emmtyper.txt emmtyper/!{sample}_emmtyper.txt emmtyper !{sample}
'''
}
