# Test datasets
#
# Workflow file for run "Test datasets #7".

name: Test datasets

# Triggered manually (workflow_dispatch) and on a weekly schedule.
# GitHub evaluates cron expressions in UTC: Tuesdays at 04:30.
on:
  workflow_dispatch:
  schedule:
    - cron: '30 4 * * TUE'

jobs:
  check:
    # ubuntu-20.04 hosted runners have been retired; pin a supported image.
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Downloads the NCBI datasets/dataformat CLI binaries into the
      # workspace, using the version tag found in modules/datasets.nf.
      - name: download datasets
        run: |
          # Debug output: show each stage of the version parsing below.
          grep -h container modules/datasets.nf
          grep -h container modules/datasets.nf | sort | uniq
          grep -h container modules/datasets.nf | sort | uniq | awk '{print $2}'
          grep -h container modules/datasets.nf | sort | uniq | head -n 1 | awk '{print $2}' | cut -f 2 -d ":"

          # Take the second field of the first "container" line mentioning
          # datasets, keep the text after ':' and strip single quotes.
          # NOTE(review): assumes a line like  container 'repo/image:TAG'
          DATASETS_VER=$(grep -h container modules/datasets.nf \
            | grep datasets \
            | sort \
            | uniq \
            | head -n 1 \
            | awk '{print $2}' \
            | cut -f 2 -d ":" \
            | sed 's/'\''//g')
          echo "The datasets version is $DATASETS_VER"

          # Fail early with a clear message instead of a confusing wget 404.
          if [ -z "$DATASETS_VER" ]; then
            echo "Could not determine the datasets version from modules/datasets.nf" >&2
            exit 1
          fi

          wget "https://github.com/ncbi/datasets/releases/download/v${DATASETS_VER}/linux-amd64.cli.package.zip" && \
            unzip linux-amd64.cli.package.zip && \
            rm linux-amd64.cli.package.zip && \
            chmod +x dataformat datasets

      # For each taxon, writes <Genus_species>_genomes.csv containing the
      # reference genome summary followed by up to 5 additional genomes.
      - name: get accessions
        run: |
          # Bash array elements are separated by whitespace only; commas
          # would become part of the taxon names.
          taxons=(
            "Acinetobacter baumannii"
            "Burkholderia cenocepacia"
            "Campylobacter jejuni"
            "Citrobacter sp"
            "Elizabethkingia meningoseptica"
            "Escherichia coli"
            "Klebsiella oxytoca"
            "Legionella pneumophila"
            "Pseudomonas sp"
            "Raoultella ornithinolytica"
            "Salmonella enterica"
          )

          # Columns requested from dataformat for both queries.
          fields="accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len"

          # Quote the expansion so each two-word taxon stays one item.
          for taxon in "${taxons[@]}"
          do
            echo "the taxon is now $taxon"
            # File-name friendly form: spaces replaced by underscores.
            organism=$(echo "$taxon" | sed 's/ /_/g')

            echo "Getting the representative genome"
            # dataformat was unpacked into the workspace by the previous
            # step, so it must be invoked as ./dataformat like ./datasets.
            ./datasets summary genome taxon "$taxon" --reference --limit 5 --as-json-lines | \
              ./dataformat tsv genome --fields "$fields" | \
              grep -v Homo | \
              tr '\t' ',' \
              | tee "${organism}_genomes.csv"

            echo "Getting additional genomes"
            # Drop the TSV header here; the file already has one from above.
            ./datasets summary genome taxon "$taxon" --limit 5 --as-json-lines | \
              ./dataformat tsv genome --fields "$fields" | \
              grep -v Homo | \
              grep -v "Assembly Accession" | \
              tr '\t' ',' \
              | tee -a "${organism}_genomes.csv"
          done

          head *genomes.csv