Merge pull request #99 from UPHL-BioNGS/update20230802
Update20230802
erinyoung committed Aug 4, 2023
2 parents cf9e235 + d8c43ec commit 01a39f2
Showing 26 changed files with 268 additions and 103 deletions.
11 changes: 9 additions & 2 deletions .github/workflows/ecoli.yml
@@ -38,5 +38,12 @@ jobs:
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
cat grandeur/shigatyper/shigatyper_results.txt
cat grandeur/serotypefinder/serotypefinder_results.txt
- name: Check E. coli file
run: |
for file in grandeur/shigatyper/shigatyper_results.txt grandeur/serotypefinder/serotypefinder_results.txt
do
head $file
wc -l $file
done
18 changes: 13 additions & 5 deletions .github/workflows/just_msa.yml
@@ -24,13 +24,21 @@ jobs:
run: |
docker --version
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/864/595/GCA_904864595.1_INF333/GCA_904864595.1_INF333_genomic.fna.gz && gzip -d GCA_904864595.1_INF333_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/783/245/GCA_013783245.1_ASM1378324v1/GCA_013783245.1_ASM1378324v1_genomic.fna.gz && gzip -d GCA_013783245.1_ASM1378324v1_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/026/626/185/GCA_026626185.1_ASM2662618v1/GCA_026626185.1_ASM2662618v1_genomic.fna.gz && gzip -d GCA_026626185.1_ASM2662618v1_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/808/985/GCA_020808985.1_ASM2080898v1/GCA_020808985.1_ASM2080898v1_genomic.fna.gz && gzip -d GCA_020808985.1_ASM2080898v1_genomic.fna.gz
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/863/225/GCA_904863225.1_KSB1_6J/GCA_904863225.1_KSB1_6J_genomic.fna.gz && gzip -d GCA_904863225.1_KSB1_6J_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/864/595/GCA_904864595.1_INF333/GCA_904864595.1_INF333_genomic.fna.gz && gzip -d GCA_904864595.1_INF333_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/783/245/GCA_013783245.1_ASM1378324v1/GCA_013783245.1_ASM1378324v1_genomic.fna.gz && gzip -d GCA_013783245.1_ASM1378324v1_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/026/626/185/GCA_026626185.1_ASM2662618v1/GCA_026626185.1_ASM2662618v1_genomic.fna.gz && gzip -d GCA_026626185.1_ASM2662618v1_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/808/985/GCA_020808985.1_ASM2080898v1/GCA_020808985.1_ASM2080898v1_genomic.fna.gz && gzip -d GCA_020808985.1_ASM2080898v1_genomic.fna.gz
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/863/225/GCA_904863225.1_KSB1_6J/GCA_904863225.1_KSB1_6J_genomic.fna.gz && gzip -d GCA_904863225.1_KSB1_6J_genomic.fna.gz
mkdir fastas
mv *fna fastas/.
nextflow run . -profile docker,just_msa --maxcpus 2 --medcpus 2
- name: Check MSA files
run: |
for file in grandeur/roary/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix_with_qc.txt
do
head $file
wc -l $file
done
9 changes: 8 additions & 1 deletion .github/workflows/legionella.yml
@@ -40,4 +40,11 @@ jobs:
cat grandeur/grandeur_summary.tsv
cat grandeur/legsta/legsta_summary.csv
- name: Check Legionella file
run: |
for file in grandeur/legsta/legsta_summary.csv
do
head $file
wc -l $file
done
9 changes: 8 additions & 1 deletion .github/workflows/run_workflow.yml
@@ -31,4 +31,11 @@ jobs:
mv *fastq.gz reads/.
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
- name: Check summary files
run: |
for file in grandeur/mlst/mlst_summary.tsv
do
head $file
wc -l $file
done
11 changes: 10 additions & 1 deletion .github/workflows/salmonella.yml
@@ -37,5 +37,14 @@ jobs:
done
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
cat grandeur/seqsero2/seqsero2_results.txt
- name: Check Salmonella file
run: |
for file in grandeur/seqsero2/seqsero2_results.txt
do
head $file
wc -l $file
done
9 changes: 8 additions & 1 deletion .github/workflows/strepA.yml
@@ -38,4 +38,11 @@ jobs:
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
cat grandeur/emmtyper/emmtyper_summary.tsv
- name: Check Strep A file
run: |
for file in grandeur/emmtyper/emmtyper_summary.tsv
do
head $file
wc -l $file
done
10 changes: 9 additions & 1 deletion .github/workflows/strep_pneumo.yml
@@ -38,4 +38,12 @@ jobs:
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
cat grandeur/pbptyper/pbptyper_summary.tsv
- name: Check Strep pneumo file
run: |
for file in grandeur/pbptyper/pbptyper_summary.tsv
do
head $file
wc -l $file
done
4 changes: 3 additions & 1 deletion .github/workflows/vibrio.yml
@@ -38,4 +38,6 @@ jobs:
nextflow run . -profile docker --maxcpus 2 --medcpus 2
cat grandeur/grandeur_summary.tsv
grep -i vibrio grandeur/fastani/fastani_summary.csv
- name: Check Vibrio species
run: grep -i vibrio grandeur/fastani/fastani_summary.csv
48 changes: 48 additions & 0 deletions bin/datasets_download.py
@@ -0,0 +1,48 @@
#!/usr/bin/env python3

'''
Author: Erin Young
Description:
    This script gets genome accessions for a taxon from NCBI datasets.
EXAMPLE:
    python3 datasets_download.py taxon hits
'''

import subprocess
import sys

fields = 'accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len'

taxon = sys.argv[1]
genus, species = taxon.replace('[', '').replace(']', '').split('_')
print("Looking for accessions for " + genus + " " + species)
outfile = open('datasets/' + genus + '_' + species + '_genomes.csv', 'w')

# number of genomes to request; defaults to 5
try:
    hits = sys.argv[2]
except IndexError:
    hits = '5'

# putting in the header
outfile.write(fields + '\n')

# Getting representative genomes
# (no literal quotes around the taxon: subprocess passes each list item as one argument)
rep = subprocess.Popen(['datasets', 'summary', 'genome', 'taxon', genus + ' ' + species, '--reference', '--limit', hits, '--as-json-lines'], stdout=subprocess.PIPE)
dft = subprocess.check_output(['dataformat', 'tsv', 'genome', '--fields', fields], stdin=rep.stdout, text=True)
rep.wait()
for line in dft.split('\n'):
    # skip the header and blank lines; keep genomes under 15 Mb ungapped length
    if 'Ungapped Length' not in line and line:
        if int(line.split('\t')[4]) < 15000000:
            outfile.write(line.replace('\t', ',') + '\n')

# Getting additional genomes
oth = subprocess.Popen(['datasets', 'summary', 'genome', 'taxon', genus + ' ' + species, '--limit', hits, '--as-json-lines'], stdout=subprocess.PIPE)
df2 = subprocess.check_output(['dataformat', 'tsv', 'genome', '--fields', fields], stdin=oth.stdout, text=True)
oth.wait()
for line in df2.split('\n'):
    if 'Ungapped Length' not in line and line:
        if int(line.split('\t')[4]) < 15000000:
            outfile.write(line.replace('\t', ',') + '\n')

outfile.close()
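
For orientation, a minimal usage sketch (not part of the commit): it assumes the datasets and dataformat CLIs are on PATH and that the datasets/ output directory already exists; the taxon and hit count are illustrative.

mkdir -p datasets
# the taxon argument uses an underscore between genus and species, matching the pipeline convention
python3 bin/datasets_download.py Salmonella_enterica 5
head datasets/Salmonella_enterica_genomes.csv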
76 changes: 76 additions & 0 deletions bin/summary_file.py
@@ -0,0 +1,76 @@
#!/usr/bin/env python3

##########################################
# written by Erin Young                  #
# for creating summary files with the    #
# sample id for Grandeur                 #
##########################################

import os
import sys

out = sys.argv[2]
spl = sys.argv[4]

if not os.path.exists(sys.argv[1]):
    print("File " + sys.argv[1] + " does not exist. Exiting.")
    sys.exit()

# guess the delimiter from the first line
coms = 0
tabs = 0
with open(sys.argv[1]) as file:
    first_line = file.readline()
    coms = first_line.count(',')
    tabs = first_line.count('\t')

if tabs > coms:
    delim = '\t'
    print("Predicting tab delimited")
else:
    delim = ','
    print("Predicting comma delimited")

# debug output: echo each line and how it splits
with open(sys.argv[1], 'r') as file:
    lines = file.readlines()
    for line in lines:
        print(line)
        print(line.split(delim))

outfile = open(out, "w")

final_delim = ','
header = 'shouldntexist'  # sentinel that should never match a real line

# TODO: turn this into a dict
if sys.argv[3] == 'mlst':
    final_delim = '\t'
    header = 'PubMLST'
    outfile.write('sample\tfilename\tmatching PubMLST scheme\tST\tID1\tID2\tID3\tID4\tID5\tID6\tID7\tID8\tID9\tID10\tID11\tID12\tID13\tID14\tID15\n')
elif sys.argv[3] == 'shigatyper':
    final_delim = '\t'
    header = 'Number'
elif sys.argv[3] == 'kleborate':
    final_delim = '\t'
    header = 'largest_contig'
elif sys.argv[3] == 'plasmidfinder':
    final_delim = '\t'
    header = 'Accession number'
elif sys.argv[3] == 'emmtyper':
    outfile.write('sample\tIsolate name\tNumber of BLAST hits\tNumber of clusters\tPredicted emm-type\tPosition(s)\tPossible emm-like alleles\temm-like position(s)\tEMM cluster\n')
    final_delim = '\t'
    header = 'Number of BLAST hits'
elif sys.argv[3] == 'serotypefinder':
    final_delim = '\t'
    header = 'HSP length'

print("Using final delim " + final_delim + " with sample " + spl + " for " + sys.argv[3])

# prefix the header line with 'sample' and every other line with the sample id
with open(sys.argv[1]) as file:
    lines = file.readlines()
    for line in lines:
        if header in line:
            replace = line.replace(delim, final_delim)
            outfile.write('sample' + final_delim + replace)
        else:
            replace = line.replace(delim, final_delim)
            outfile.write(spl + final_delim + replace)

outfile.close()
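
A hedged usage sketch (file names and sample id are hypothetical): the four positional arguments are the raw tool output, the destination file, the tool name, and the sample id.

# normalize an mlst result to tabs and prefix every row with the sample id
python3 bin/summary_file.py sample01_mlst.tsv mlst/sample01_mlst_summary.tsv mlst sample01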
12 changes: 8 additions & 4 deletions grandeur.nf
@@ -146,8 +146,10 @@ include { test } from "./subworkflows/test"
// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

// Creating the summary files
summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file")
dataset_script = Channel.fromPath(workflow.projectDir + "/bin/datasets_download.py", type: "file")
snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file")
summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
summfle_script = Channel.fromPath(workflow.projectDir + "/bin/summary_file.py", type: "file")

// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

@@ -318,7 +320,8 @@ workflow {
ch_for_summary.collect(),
ch_contigs,
ch_fastani_genomes,
ch_genome_ref)
ch_genome_ref,
dataset_script)

ch_for_summary = ch_for_summary.mix(average_nucleotide_identity.out.for_summary)
ch_for_flag = ch_for_flag.mix(average_nucleotide_identity.out.for_flag)
@@ -341,7 +344,8 @@
ch_raw_reads,
ch_contigs,
ch_for_flag,
ch_size)
ch_size,
summfle_script)

ch_for_summary = ch_for_summary.mix(information.out.for_summary)
ch_for_multiqc = ch_for_multiqc.mix(information.out.for_multiqc)
2 changes: 1 addition & 1 deletion modules/blobtools.nf
@@ -125,4 +125,4 @@ process blobtools_plot {
grep -vw all blobtools/!{sample}_summary.txt > blobtools/!{sample}_blobtools.txt
'''
}
}
28 changes: 7 additions & 21 deletions modules/datasets.nf
@@ -1,7 +1,7 @@
process datasets_summary {
tag "${taxon}"
publishDir params.outdir, mode: 'copy'
container 'staphb/ncbi-datasets:15.2.0'
container 'quay.io/uphl/datasets:15.12.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -10,10 +10,10 @@ process datasets_summary {
//#UPHLICA time '10m'

input:
val(taxon)
tuple val(taxon), file(script)

output:
path "datasets/*_genomes.csv" , emit: genomes
path "datasets/*_genomes.csv" , emit: genomes, optional: true
path "logs/${task.process}/*.${workflow.sessionId}.log", emit: log

shell:
@@ -28,32 +28,18 @@ process datasets_summary {
echo "Nextflow command : " >> $log_file
cat .command.sh >> $log_file
taxon="$(echo !{taxon} | tr '_' ' ' | sed 's/\[//g' | sed 's/]//g' )"
echo "the taxon is now $taxon"
datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
grep -v Homo | \
tr '\\t' ',' \
> datasets/!{taxon}_genomes.csv
datasets summary genome taxon "$taxon" --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
grep -v Homo | \
grep -v "Assembly Accession" | \
tr '\\t' ',' \
>> datasets/!{taxon}_genomes.csv
python3 !{script} !{taxon} !{params.datasets_max_genomes}
'''
}

// It is faster if datasets can download the entire list at a time, but there is a timeout for downloading that is about 20 minutes.
// It is faster if datasets can download the entire list at a time, but there is a 20 minute timeout for downloading.
// The '||' falls back to downloading each genome on its own, which takes longer overall, but each individual download should finish in under 20 minutes.
process datasets_download {
tag "Downloading Genomes"
// because there's no way to specify threads
label "medcpus"
publishDir = [ path: "${params.outdir}", mode: 'copy', pattern: "{logs/*/*log,datasets/fastani_refs.tar.gz}" ]
container 'staphb/ncbi-datasets:15.2.0'
container 'quay.io/uphl/datasets:15.12.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -83,7 +69,7 @@
cat .command.sh >> $log_file
cut -f 1 !{genomes} > all_runs.txt
grep -h -v Accession !{ids} | cut -f 1 -d , | sort | uniq > this_run.txt
grep -h -v accession !{ids} | cut -f 1 -d , | sort | uniq > this_run.txt
cat all_runs.txt this_run.txt | sort | uniq > id_list.txt
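
The '||' fallback described in the comments above sits outside this hunk; a rough sketch of the pattern, assuming id_list.txt holds one accession per line (the exact flags used in the module may differ).

# try the whole list in one download; on timeout or failure, fall back to one genome at a time
datasets download genome accession --inputfile id_list.txt --filename datasets.zip || \
for accession in $(cat id_list.txt)
do
  datasets download genome accession $accession --filename ${accession}.zip
done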
12 changes: 6 additions & 6 deletions modules/emmtyper.nf
@@ -10,11 +10,11 @@ process emmtyper {
//#UPHLICA cpus 3
//#UPHLICA time '24h'

when:
flag =~ 'found'
when:
flag =~ 'found'

input:
tuple val(sample), file(contigs), val(flag)
tuple val(sample), file(contigs), val(flag), file(script)

output:
path "emmtyper/${sample}_emmtyper.txt" , emit: collect
@@ -34,12 +34,12 @@
echo "Nextflow command : " >> $log_file
cat .command.sh >> $log_file
echo -e "sample\\tIsolate name\\tNumber of BLAST hits\\tNumber of clusters\\tPredicted emm-type\\tPosition(s)\\tPossible emm-like alleles\\temm-like position(s)\\tEMM cluster" > emmtyper/!{sample}_emmtyper.txt
emmtyper !{params.emmtyper_options} \
--output-format 'verbose' \
!{contigs} \
| tee -a $log_file \
| awk -v sample=!{sample} '{ print sample "\\t" $0 }' >> emmtyper/!{sample}_emmtyper.txt
> !{sample}_emmtyper.txt
python3 !{script} !{sample}_emmtyper.txt emmtyper/!{sample}_emmtyper.txt emmtyper !{sample}
'''
}
