diff --git a/.github/workflows/datasets.yml b/.github/workflows/datasets.yml
new file mode 100644
index 0000000..9ebc15b
--- /dev/null
+++ b/.github/workflows/datasets.yml
@@ -0,0 +1,78 @@
+# Scheduled check of the NCBI datasets CLI: downloads the release whose version
+# is read from modules/datasets.nf and writes genome summaries for a fixed list
+# of taxa to per-organism CSV files.
+name: Test datasets
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '30 4 * * TUE'
+
+jobs:
+  check:
+    # NOTE(review): GitHub appears to have retired the ubuntu-20.04 hosted
+    # runner image; confirm and move to a supported image if so.
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: download datasets
+        run: |
+          # Version = text after ':' in the second word of the first (sorted)
+          # line mentioning "container", with single quotes stripped.
+          DATASETS_VER=$(grep container modules/datasets.nf | sort | uniq | head -n 1 | awk '{print $2}' | sed 's/'\''//g' | cut -f 2 -d ":")
+          if [ -z "${DATASETS_VER}" ]; then
+            echo "Could not determine the datasets version from modules/datasets.nf" >&2
+            exit 1
+          fi
+
+          # Download, unpack and mark the CLI binaries executable.
+          wget "https://github.com/ncbi/datasets/releases/download/v${DATASETS_VER}/linux-amd64.cli.package.zip" && \
+          unzip linux-amd64.cli.package.zip && \
+          rm linux-amd64.cli.package.zip && \
+          chmod +x dataformat datasets
+
+      - name: get accessions
+        run: |
+          # Bash array elements are whitespace-separated; commas would become
+          # part of the taxon names.
+          taxons=(
+            "Acinetobacter baumannii"
+            "Burkholderia cenocepacia"
+            "Campylobacter jejuni"
+            "Citrobacter sp"
+            "Elizabethkingia meningoseptica"
+            "Escherichia coli"
+            "Klebsiella oxytoca"
+            "Legionella pneumophila"
+            "Pseudomonas sp"
+            "Raoultella ornithinolytica"
+            "Salmonella enterica"
+          )
+
+          # Quote the expansion so multi-word taxa stay intact.
+          for taxon in "${taxons[@]}"
+          do
+            echo "the taxon is now $taxon"
+            # File-name friendly form of the taxon (spaces -> underscores).
+            organism=$(echo "$taxon" | sed 's/ /_/g')
+
+            echo "Getting the representative genome"
+            ./datasets summary genome taxon "$taxon" --reference --limit 5 --as-json-lines | \
+            ./dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
+            grep -v Homo | \
+            tr '\t' ',' \
+            | tee "${organism}_genomes.csv"
+
+            echo "Getting additional genomes"
+            # Filter out lines containing "Assembly Accession" (presumably the
+            # header row) before appending to the same CSV.
+            ./datasets summary genome taxon "$taxon" --limit 5 --as-json-lines | \
+            ./dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
+            grep -v Homo | \
+            grep -v "Assembly Accession" | \
+            tr '\t' ',' \
+            | tee -a "${organism}_genomes.csv"
+          done
+
+          head *genomes.csv