diff --git a/README.md b/README.md index 34ae7091..a1cb2860 100644 --- a/README.md +++ b/README.md @@ -16,24 +16,25 @@ staramr search -o out --pointfinder-organism salmonella *.fasta **out/summary.tsv**: -| Isolate ID | Genotype | Predicted Phenotype | Plasmid | -|------------|-----------------------------------------------------------|-----------------------------------------------------------------------------------------------------------| ------------------------------------| -| SRR1952908 | aadA1, aadA2, blaTEM-57, cmlA1, gyrA (S83Y), sul3, tet(A) | streptomycin, ampicillin, chloramphenicol, ciprofloxacin I/R, nalidixic acid, sulfisoxazole, tetracycline | ColpVC, IncFIB(S), IncFII(S), IncI1 | -| SRR1952926 | blaTEM-57, gyrA (S83Y), tet(A) | ampicillin, ciprofloxacin I/R, nalidixic acid, tetracycline | ColpVC, IncFIB(S), IncFII(S), IncI1 | +| Isolate ID | Genotype | Predicted Phenotype | Plasmid | Scheme | Sequence Type | +|------------|-----------------------------------------------------------|-----------------------------------------------------------------------------------------------------------|-------------------------------------|-----------|---------------| +| SRR1952908 | aadA1, aadA2, blaTEM-57, cmlA1, gyrA (S83Y), sul3, tet(A) | streptomycin, ampicillin, chloramphenicol, ciprofloxacin I/R, nalidixic acid, sulfisoxazole, tetracycline | ColpVC, IncFIB(S), IncFII(S), IncI1 | senterica | 11 | +| SRR1952926 | blaTEM-57, gyrA (S83Y), tet(A) | ampicillin, ciprofloxacin I/R, nalidixic acid, tetracycline | ColpVC, IncFIB(S), IncFII(S), IncI1 | senterica | 11 | **out/detailed_summary.tsv**: -| Isolate ID | Gene/Plasmid | Predicted Phenotype | %Identity | %Overlap | HSP Length/Total Length | Contig | Start | End | Accession | Data Type | -|------------|--------------|-----------------------------------|-----------|----------|-------------------------|-------------|--------|--------|-----------|------------| -| SRR1952908 | ColpVC | | 98.96 | 100 | 193/193 | 
contig00038 | 1618 | 1426 | JX133088 | Plasmid | -| SRR1952908 | sul3 | sulfisoxazole | 100 | 100 | 792/792 | contig00030 | 2091 | 2882 | AJ459418 | Resistance | +| Isolate ID | Data | Data Type | Predicted Phenotype | %Identity | %Overlap | HSP Length/Total Length | Contig | Start | End | Accession | +|------------|------------------|------------|---------------------|-----------|----------|-------------------------|-------------|-------|------|-----------| +| SRR1952908 | ST11 (senterica) | MLST | | | | | | | | | +| SRR1952908 | ColpVC | Plasmid | | 98.96 | 100 | 193/193 | contig00038 | 1618 | 1426 | JX133088 | +| SRR1952908 | aadA1 | Resistance | streptomycin | 100 | 100 | 792/792 | contig00030 | 5355 | 4564 | JQ414041 | **out/resfinder.tsv**: -| Isolate ID | Gene | Predicted Phenotype | %Identity | %Overlap | HSP Length/Total Length | Contig | Start | End | Accession | -|------------|------------|----------------------|------------|-----------|--------------------------|--------------|--------|-------|-----------| -| SRR1952908 | sul3 | sulfisoxazole | 100.00 | 100.00 | 792/792 | contig00030 | 2091 | 2882 | AJ459418 | -| SRR1952908 | tet(A) | tetracycline | 99.92 | 100.00 | 1200/1200 | contig00032 | 1551 | 2750 | AJ517790 | +| Isolate ID | Gene | Predicted Phenotype | %Identity | %Overlap | HSP Length/Total Length | Contig | Start | End | Accession | +|------------|--------|---------------------|-----------|----------|-------------------------|-------------|-------|------|-----------| +| SRR1952908 | sul3 | sulfisoxazole | 100 | 100 | 792/792 | contig00030 | 2091 | 2882 | AJ459418 | +| SRR1952908 | tet(A) | tetracycline | 99.92 | 97.8 | 1247/1275 | contig00032 | 1476 | 2722 | AF534183 | **out/pointfinder.tsv**: @@ -49,6 +50,13 @@ staramr search -o out --pointfinder-organism salmonella *.fasta | SRR1952908 | ColpVC | 98.96 | 100 | 193/193 | contig00038 | 1618 | 1426 | JX133088 | | SRR1952908 | IncFIB(S) | 98.91 | 100 | 643/643 | contig00024 | 10302 | 9660 | 
FN432031 | +**out/mlst.tsv**: + +| Isolate ID | Scheme | Sequence Type | Locus 1 | Locus 2 | Locus 3 | Locus 4 | Locus 5 | Locus 6 | Locus 7 | +|------------|-----------|---------------|---------|---------|---------|---------|---------|---------|----------| +| SRR1952908 | senterica | 11 | aroC(5) | dnaN(2) | hemD(3) | hisD(7) | purE(6) | sucA(6) | thrA(11) | +| SRR1952926 | senterica | 11 | aroC(5) | dnaN(2) | hemD(3) | hisD(7) | purE(6) | sucA(6) | thrA(11) | + # Table of Contents - [Quick Usage](#quick-usage) @@ -67,6 +75,7 @@ staramr search -o out --pointfinder-organism salmonella *.fasta * [resfinder.tsv](#resfindertsv) * [pointfinder.tsv](#pointfindertsv) * [plasmidfinder.tsv](#plasmidfindertsv) + * [mlst.tsv](#mlsttsv) * [settings.txt](#settingstxt) * [hits/](#hits) - [Tutorial](#tutorial) @@ -109,6 +118,14 @@ staramr search --plasmidfinder-database-type enterobacteriaceae -o out *.fasta ``` Where `--plasmidfinder-database-type` is the specific database type you are interested in (currently only *gram_positive*, *enterobacteriaceae* are supported). By default, both databases are used. +To specify which MLST scheme to use, please run: + +```bash +staramr search -o out --mlst-scheme senterica *.fasta +``` + +Where `--mlst-scheme` is the specific organism you are interested in (please visit the [scheme genus map](https://github.com/tseemann/mlst/blob/master/db/scheme_species_map.tab) to see which are available). By default, it detects the scheme automatically. + ## Database Info To print information about the installed databases, please run: @@ -199,7 +216,7 @@ source .venv/bin/activate pip install -e . # Now run `staramr` -staramr +staramr ``` Due to the way we packaged the ResFinder/PointFinder/PlasmidFinder databases, the development code will not come with a default database. You must first build the database before usage. E.g. 
@@ -213,6 +230,7 @@ staramr db restore-default * Python 3.5+ * BLAST+ * Git +* MLST # Input @@ -230,15 +248,16 @@ Please make sure to include `#gene_id` in the first line. The default exclusion # Output -There are 7 different output files produced by `staramr`: +There are 8 different output files produced by `staramr`: -1. `summary.tsv`: A summary of all detected AMR genes/mutations/plasmids in each genome, one genome per line. -2. `detailed_summary.tsv`: A detailed summary of all detected AMR genes/mutations/plasmids in each genome, one gene per line. +1. `summary.tsv`: A summary of all detected AMR genes/mutations/plasmids/sequence type in each genome, one genome per line. +2. `detailed_summary.tsv`: A detailed summary of all detected AMR genes/mutations/plasmids/sequence type in each genome, one gene per line. 3. `resfinder.tsv`: A tabular file of each AMR gene and additional BLAST information from the **ResFinder** database, one gene per line. 4. `pointfinder.tsv`: A tabular file of each AMR point mutation and additional BLAST information from the **PointFinder** database, one gene per line. 5. `plasmidfinder.tsv`: A tabular file of each AMR plasmid type and additional BLAST information from the **PlasmidFinder** database, one plasmid type per line. -6. `settings.txt`: The command-line, database versions, and other settings used to run `staramr`. -7. `results.xlsx`: An Excel spreadsheet containing the previous 6 files as separate worksheets. +6. `mlst.tsv`: A tabular file of each multi-locus sequence type (MLST) and its corresponding locus/alleles, one genome per line. +7. `settings.txt`: The command-line, database versions, and other settings used to run `staramr`. +8. `results.xlsx`: An Excel spreadsheet containing the previous 7 files as separate worksheets. In addition, the directory `hits/` stores fasta files of the specific blast hits. 
@@ -250,20 +269,23 @@ The **summary.tsv** output file generated by `staramr` contains the following co * __Genotype__: The AMR genotype of the isolate. * __Predicted Phenotype__: The predicted AMR phenotype (drug resistances) for the isolate. * __Plasmid__: Plasmid types that were found for the isolate. +* __Scheme__: The MLST scheme used +* __Sequence Type__: The sequence type that's assigned when combining all allele types ### Example -| Isolate ID | Genotype | Predicted Phenotype | Plasmid | -|------------|-----------------------------------------------------------|-----------------------------------------------------------------------------------------------------------| ------------------------------------| -| SRR1952908 | aadA1, aadA2, blaTEM-57, cmlA1, gyrA (S83Y), sul3, tet(A) | streptomycin, ampicillin, chloramphenicol, ciprofloxacin I/R, nalidixic acid, sulfisoxazole, tetracycline | ColpVC, IncFIB(S), IncFII(S), IncI1 | -| SRR1952926 | blaTEM-57, gyrA (S83Y), tet(A) | ampicillin, ciprofloxacin I/R, nalidixic acid, tetracycline | ColpVC, IncFIB(S), IncFII(S), IncI1 | +| Isolate ID | Genotype | Predicted Phenotype | Plasmid | Scheme | Sequence Type | +|------------|-----------------------------------------------------------|-----------------------------------------------------------------------------------------------------------|-------------------------------------|-----------|---------------| +| SRR1952908 | aadA1, aadA2, blaTEM-57, cmlA1, gyrA (S83Y), sul3, tet(A) | streptomycin, ampicillin, chloramphenicol, ciprofloxacin I/R, nalidixic acid, sulfisoxazole, tetracycline | ColpVC, IncFIB(S), IncFII(S), IncI1 | senterica | 11 | +| SRR1952926 | blaTEM-57, gyrA (S83Y), tet(A) | ampicillin, ciprofloxacin I/R, nalidixic acid, tetracycline | ColpVC, IncFIB(S), IncFII(S), IncI1 | senterica | 11 | ## detailed_summary.tsv The **detailed_summary.tsv** output file generated by `staramr` contains the following columns: * __Isolate ID__: The id of the isolate/genome 
file(s) passed to `staramr`. -* __Gene/Plasmid__: The particular gene detected from ResFinder, PlasmidFinder, and PointFinder. +* __Data__: The particular gene detected from ResFinder, PlasmidFinder, PointFinder, or the sequence type. +* __Data Type__: The type of gene (Resistance or Plasmid), or MLST. * __Predicted Phenotype__: The predicted AMR phenotype (drug resistances) found in ResFinder/PointFinder. Plasmids will be left blank by default. * __%Identity__: The % identity of the top BLAST HSP to the gene. * __%Overlap__: THe % overlap of the top BLAST HSP to the gene (calculated as __hsp length/total length * 100__). @@ -272,13 +294,13 @@ The **detailed_summary.tsv** output file generated by `staramr` contains the fol * __Start__: The start of the gene (will be greater than __End__ if on minus strand). * __End__: The end of the gene. * __Accession__: The accession of the gene from either ResFinder or PlasmidFinder database. -* __Data Type__: The type of gene it is either a **Resistance** gene or a **Plasmid** gene ### Example -| Isolate ID | Gene/Plasmid | Predicted Phenotype | %Identity | %Overlap | HSP Length/Total Length | Contig | Start | End | Accession | Data Type | -|------------|--------------|-----------------------------------|-----------|----------|-------------------------|-------------|--------|--------|-----------|------------| -| SRR1952926 | IncI1 | | 100 | 100 | 142/142 | contig00017 | 3907 | 3766 | AP005147 | Plasmid | -| SRR1952926 | blaTEM-57 | ampicillin | 99.88 | 100 | 861/861 | contig00027 | 6176 | 5316 | FJ405211 | Resistance | +| Isolate ID | Data | Data Type | Predicted Phenotype | %Identity | %Overlap | HSP Length/Total Length | Contig | Start | End | Accession | +|------------|------------------|-----------|---------------------|-----------|----------|-------------------------|-------------|-------|------|-----------| +| SRR1952908 | ST11 (senterica) | MLST | | | | | | | | | +| SRR1952908 | ColpVC | Plasmid | | 98.96 | 100 | 193/193 
| contig00038 | 1618 | 1426 | JX133088 | +| SRR1952908 | IncFIB(S) | Plasmid | | 98.91 | 100 | 643/643 | contig00024 | 10302 | 9660 | FN432031 | ## resfinder.tsv @@ -347,6 +369,22 @@ The **plasmidfinder.tsv** output file generated by `staramr` contains the follow | SRR1952908 | ColpVC | 98.96 | 100 | 193/193 | contig00038 | 1618 | 1426 | JX133088 | | SRR1952908 | IncFIB(S) | 98.91 | 100 | 643/643 | contig00024 | 10302 | 9660 | FN432031 | +## mlst.tsv + +The **mlst.tsv** output file generated by `staramr` contains the following columns: + +* __Isolate ID__: The id of the isolate/genome file(s) passed to `staramr`. +* __Scheme__: The scheme that `MLST` has identified. +* __Sequence Type__: The sequence type that's assigned when combining all allele types +* __Locus #__: A particular locus in the specified MLST scheme. + +### Example + +| Isolate ID | Scheme | Sequence Type | Locus 1 | Locus 2 | Locus 3 | Locus 4 | Locus 5 | Locus 6 | Locus 7 | +|------------|-----------|---------------|---------|---------|---------|---------|---------|---------|----------| +| SRR1952908 | senterica | 11 | aroC(5) | dnaN(2) | hemD(3) | hisD(7) | purE(6) | sucA(6) | thrA(11) | +| SRR1952926 | senterica | 11 | aroC(5) | dnaN(2) | hemD(3) | hisD(7) | purE(6) | sucA(6) | thrA(11) | + ## settings.txt The **settings.txt** file contains the particular settings used to run `staramr`. @@ -358,6 +396,7 @@ The **settings.txt** file contains the particular settings used to run `staramr` * __resfinder_db_url__, __pointfinder_db_url__, __plasmidfinder_db_url__: The URL to the git repository for the ResFinder, PointFinder, and PlasmidFinder databases. * __resfinder_db_commit__, __pointfinder_db_commit__, __plasmidfinder_db_commit__: The git commit ids for the ResFinder, PointFinder, and PlasmidFinder databases. * __resfinder_db_date__, __pointfinder_db_date__, __plasmidfinder_db_date__: The date of the git commits of the ResFinder, PointFinder, and PlasmidFinder databases. 
+* __mlst_version__: The version of `MLST`. * __pointfinder_gene_drug_version__, __resfinder_gene_drug_version__: A version identifier for the gene/drug mapping table used by `staramr`. ### Example @@ -431,7 +470,9 @@ This software is still a work-in-progress. In particular, not all organisms sto Some ideas for the software were derived from the [ResFinder][resfinder-git], [PointFinder][pointfinder-git], and [PlasmidFinder][plasmidfinder-git] command-line software, as well as from [ABRicate][abricate]. -Phenotype/drug resistance predictions are provided with support from the NARMS/CIPARS Molecular Working Group. +Phenotype/drug resistance predictions are provided with support from the NARMS/CIPARS Molecular Working Group. + +The multi-locus sequence typing program is from the [MLST] GitHub repository. # Citations @@ -443,6 +484,10 @@ If you find `staramr` useful, please consider citing this GitHub repository (htt > **Carattoli A, Zankari E, Garcia-Fernandez A, Voldby Larsen M, Lund O, Villa L, Aarestrup FM, Hasman H**. PlasmidFinder and pMLST: in silico detection and typing of plasmids. Antimicrob. Agents Chemother. 2014. April 28th. doi: [10.1128/AAC.02412-14][plasmidfinder-cite] +>**Seemann T**, MLST GitHub https://github.com/tseemann/mlst + +>**Jolley KA, Bray JE and Maiden MCJ**. Open-access bacterial population genomics: BIGSdb software, the PubMLST.org website and their applications [version 1; peer review: 2 approved]. Wellcome Open Res 2018, 3:124. doi: [10.12688/wellcomeopenres.14826.1][mlst-cite] + # Legal Copyright 2018 Government of Canada @@ -465,6 +510,7 @@ specific language governing permissions and limitations under the License. 
[resfinder-cite]: https://dx.doi.org/10.1093/jac/dks261 [pointfinder-cite]: https://doi.org/10.1093/jac/dkx217 [plasmidfinder-cite]: https://doi.org/10.1128/AAC.02412-14 +[mlst-cite]: https://doi.org/10.12688/wellcomeopenres.14826.1 [Bioconda]: https://bioconda.github.io/ [requirements.txt]: requirements.txt [resfinder-git]: https://bitbucket.org/genomicepidemiology/resfinder @@ -479,3 +525,4 @@ specific language governing permissions and limitations under the License. [card-web]: https://card.mcmaster.ca/ [tutorial]: doc/tutorial/staramr-tutorial.ipynb [genes_to_exclude.tsv]: staramr/databases/exclude/data/genes_to_exclude.tsv +[MLST]: https://github.com/tseemann/mlst \ No newline at end of file diff --git a/doc/tutorial/staramr-tutorial.ipynb b/doc/tutorial/staramr-tutorial.ipynb index 0149045a..f600e161 100644 --- a/doc/tutorial/staramr-tutorial.ipynb +++ b/doc/tutorial/staramr-tutorial.ipynb @@ -54,10 +54,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2019-05-09 08:59:35-- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/478/105/GCF_001478105.1_Salmonella_enterica_CVM_N31384-SQ_v1.0/GCF_001478105.1_Salmonella_enterica_CVM_N31384-SQ_v1.0_genomic.fna.gz\n", + "--2019-08-28 12:27:27-- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/478/105/GCF_001478105.1_Salmonella_enterica_CVM_N31384-SQ_v1.0/GCF_001478105.1_Salmonella_enterica_CVM_N31384-SQ_v1.0_genomic.fna.gz\n", " => ‘GCF_001478105.1.fasta.gz’\n", - "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 2607:f220:41e:250::11\n", - "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:21... connected.\n", + "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.7, 2607:f220:41e:250::7\n", + "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. 
==> CWD (1) /genomes/all/GCF/001/478/105/GCF_001478105.1_Salmonella_enterica_CVM_N31384-SQ_v1.0 ... done.\n", @@ -65,14 +65,14 @@ "==> PASV ... done. ==> RETR GCF_001478105.1_Salmonella_enterica_CVM_N31384-SQ_v1.0_genomic.fna.gz ... done.\n", "Length: 1454519 (1.4M) (unauthoritative)\n", "\n", - "GCF_001478105.1_Sal 100%[===================>] 1.39M 5.15MB/s in 0.3s \n", + "GCF_001478105.1_Sal 100%[===================>] 1.39M 4.02MB/s in 0.3s \n", "\n", - "2019-05-09 08:59:38 (5.15 MB/s) - ‘GCF_001478105.1.fasta.gz’ saved [1454519]\n", + "2019-08-28 12:27:28 (4.02 MB/s) - ‘GCF_001478105.1.fasta.gz’ saved [1454519]\n", "\n", - "--2019-05-09 08:59:38-- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/931/595/GCF_001931595.1_ASM193159v1/GCF_001931595.1_ASM193159v1_genomic.fna.gz\n", + "--2019-08-28 12:27:28-- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/931/595/GCF_001931595.1_ASM193159v1/GCF_001931595.1_ASM193159v1_genomic.fna.gz\n", " => ‘GCF_001931595.1.fasta.gz’\n", - "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 2607:f220:41e:250::11\n", - "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:21... connected.\n", + "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.7, 2607:f220:41e:250::7\n", + "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. ==> CWD (1) /genomes/all/GCF/001/931/595/GCF_001931595.1_ASM193159v1 ... done.\n", @@ -80,9 +80,9 @@ "==> PASV ... done. ==> RETR GCF_001931595.1_ASM193159v1_genomic.fna.gz ... 
done.\n", "Length: 1498326 (1.4M) (unauthoritative)\n", "\n", - "GCF_001931595.1_ASM 100%[===================>] 1.43M 6.13MB/s in 0.2s \n", + "GCF_001931595.1_ASM 100%[===================>] 1.43M 3.17MB/s in 0.5s \n", "\n", - "2019-05-09 08:59:42 (6.13 MB/s) - ‘GCF_001931595.1.fasta.gz’ saved [1498326]\n", + "2019-08-28 12:27:29 (3.17 MB/s) - ‘GCF_001931595.1.fasta.gz’ saved [1498326]\n", "\n" ] } @@ -151,6 +151,7 @@ "plasmidfinder_db_url = https://bitbucket.org/genomicepidemiology/plasmidfinder_db.git\n", "plasmidfinder_db_commit = 81919954cbedaff39056610ab584ab4c06011ed8\n", "plasmidfinder_db_date = Tue, 20 Nov 2018 08:51\n", + "mlst_version = 2.16.1\n", "pointfinder_gene_drug_version = 050218\n", "resfinder_gene_drug_version = 050218.1\n" ] @@ -176,22 +177,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "2019-05-09 08:59:59,327 INFO: No --plasmidfinder-database-type specified. Will search the entire PlasmidFinder database\n", - "2019-05-09 08:59:59,327 INFO: --output-dir set. All files will be output to [out]\n", - "2019-05-09 08:59:59,328 INFO: Will exclude ResFinder/PointFinder genes listed in [/home/CSCScience.ca/jtran/Projects/staramr/staramr/databases/exclude/data/genes_to_exclude.tsv]. Use --no-exclude-genes to disable\n", - "2019-05-09 08:59:59,439 INFO: Making BLAST databases for input files\n", - "2019-05-09 08:59:59,705 INFO: Scheduling blasts for GCF_001478105.1.fasta\n", - "2019-05-09 08:59:59,733 INFO: Scheduling blasts for GCF_001931595.1.fasta\n", - "2019-05-09 09:00:02,093 INFO: Finished. Took 0.05 minutes.\n", - "2019-05-09 09:00:02,094 INFO: Predicting AMR resistance phenotypes is enabled. The predictions are for microbiological resistance and *not* clinical resistance. 
These results are continually being improved and we welcome any feedback.\n", - "2019-05-09 09:00:02,096 INFO: Writing resfinder to [out/resfinder.tsv]\n", - "2019-05-09 09:00:02,098 INFO: Writing pointfinder to [out/pointfinder.tsv]\n", - "2019-05-09 09:00:02,099 INFO: Writing plasmidfinder to [out/plasmidfinder.tsv]\n", - "2019-05-09 09:00:02,101 INFO: Writing summary to [out/summary.tsv]\n", - "2019-05-09 09:00:02,102 INFO: Writing detailed summary to [out/detailed_summary.tsv]\n", - "2019-05-09 09:00:02,103 INFO: Writing settings to [out/settings.txt]\n", - "2019-05-09 09:00:02,103 INFO: Writing Excel to [out/results.xlsx]\n", - "2019-05-09 09:00:02,192 INFO: BLAST hits are stored in [out/hits]\n" + "\u001b[32m2019-08-28 12:27:44\u001b[0m \u001b[1;30mINFO:\u001b[0m No --plasmidfinder-database-type specified. Will search the entire PlasmidFinder database\n", + "\u001b[32m2019-08-28 12:27:44\u001b[0m \u001b[1;30mINFO:\u001b[0m --output-dir set. All files will be output to [out]\n", + "\u001b[32m2019-08-28 12:27:44\u001b[0m \u001b[1;30mINFO:\u001b[0m Will exclude ResFinder/PointFinder genes listed in [/home/CSCScience.ca/jtran/Projects/staramr/staramr/databases/exclude/data/genes_to_exclude.tsv]. Use --no-exclude-genes to disable\n", + "\u001b[32m2019-08-28 12:27:44\u001b[0m \u001b[1;30mINFO:\u001b[0m Making BLAST databases for input files\n", + "\u001b[32m2019-08-28 12:27:44\u001b[0m \u001b[1;30mINFO:\u001b[0m Scheduling blasts and MLST for GCF_001478105.1.fasta\n", + "\u001b[32m2019-08-28 12:27:44\u001b[0m \u001b[1;30mINFO:\u001b[0m Scheduling blasts and MLST for GCF_001931595.1.fasta\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Finished. Took 0.07 minutes.\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Predicting AMR resistance phenotypes is enabled. The predictions are for microbiological resistance and *not* clinical resistance. 
These results are continually being improved and we welcome any feedback.\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Writing resfinder to [out/resfinder.tsv]\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Writing pointfinder to [out/pointfinder.tsv]\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Writing plasmidfinder to [out/plasmidfinder.tsv]\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Writing summary to [out/summary.tsv]\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Writing MLST summary to [out/mlst.tsv]\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Writing detailed summary to [out/detailed_summary.tsv]\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Writing settings to [out/settings.txt]\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m Writing Excel to [out/results.xlsx]\n", + "\u001b[32m2019-08-28 12:27:48\u001b[0m \u001b[1;30mINFO:\u001b[0m BLAST hits are stored in [out/hits]\n" ] } ], @@ -219,8 +221,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "detailed_summary.tsv plasmidfinder.tsv resfinder.tsv settings.txt\n", - "\u001b[0m\u001b[01;34mhits\u001b[0m pointfinder.tsv results.xlsx summary.tsv\n" + "detailed_summary.tsv plasmidfinder.tsv results.xlsx\n", + "\u001b[0m\u001b[01;34mhits\u001b[0m pointfinder.tsv settings.txt\n", + "mlst.tsv resfinder.tsv summary.tsv\n" ] } ], @@ -248,9 +251,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Isolate ID Genotype Predicted Phenotype Plasmid\n", - "GCF_001478105.1 blaCMY-2 ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone IncI1, IncX1\n", - "GCF_001931595.1 aac(3)-IVa, aph(3')-Ia, aph(4)-Ia, blaCTX-M-65, dfrA14, floR, gyrA (D87Y), sul1, tet(A) gentamicin, kanamycin, hygromicin, ampicillin, ceftriaxone, trimethoprim, chloramphenicol, ciprofloxacin I/R, nalidixic acid, 
sulfisoxazole, tetracycline None\n" + "Isolate ID Genotype Predicted Phenotype Plasmid Scheme Sequence Type\n", + "GCF_001478105.1 blaCMY-2 ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone IncI1, IncX1 senterica 152\n", + "GCF_001931595.1 aac(3)-IVa, aph(3')-Ia, aph(4)-Ia, blaCTX-M-65, dfrA14, floR, gyrA (D87Y), sul1, tet(A) gentamicin, kanamycin, hygromicin, ampicillin, ceftriaxone, trimethoprim, chloramphenicol, ciprofloxacin I/R, nalidixic acid, sulfisoxazole, tetracycline None senterica 32\n" ] } ], @@ -325,6 +328,26 @@ "cut -f 1,4 out/summary.tsv | column -s$'\\t' -t" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Isolate ID Scheme Sequence Type\n", + "GCF_001478105.1 senterica 152\n", + "GCF_001931595.1 senterica 32\n" + ] + } + ], + "source": [ + "# Show only Multilocus Sequence Typing Results\n", + "cut -f 1,5,6 out/summary.tsv | column -s$'\\t' -t" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -334,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -361,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -395,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -414,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -440,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -460,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -485,6 +508,33 @@ "This shows all the plasmid types that were detected in the genomes." 
] }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Isolate ID Scheme Sequence Type Locus 1 Locus 2 Locus 3 Locus 4 Locus 5 Locus 6 Locus 7\n", + "GCF_001478105.1 senterica 152 aroC(62) dnaN(53) hemD(54) hisD(60) purE(5) sucA(53) thrA(54)\n", + "GCF_001931595.1 senterica 32 aroC(17) dnaN(18) hemD(22) hisD(17) purE(5) sucA(21) thrA(19)\n" + ] + } + ], + "source": [ + "# Show all columns\n", + "column -s$'\\t' -t out/mlst.tsv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This shows all of the matched sequence types for a particular organism in the genomes." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -494,27 +544,29 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Isolate ID Gene/Plasmid Predicted Phenotype %Identity %Overlap HSP Length/Total Length Contig Start End Accession Data Type\n", - "GCF_001478105.1 IncI1 100.0 100.0 142/142 ref|NZ_JYVD01000056.1| 15896 16037 AP005147 Plasmid\n", - "GCF_001478105.1 IncX1 100.0 100.0 373/373 ref|NZ_JYVD01000049.1| 2546 2174 CP001123 Plasmid\n", - "GCF_001478105.1 blaCMY-2 ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone 100.0 100.0 1146/1146 ref|NZ_JYVD01000056.1| 25020 26165 X91840 Resistance\n", - "GCF_001931595.1 None Plasmid\n", - "GCF_001931595.1 aac(3)-IVa gentamicin 99.87 100.0 786/786 ref|NZ_CP016411.1| 292669 291885 X01385 Resistance\n", - "GCF_001931595.1 aph(3')-Ia kanamycin 100.0 100.0 816/816 ref|NZ_CP016411.1| 300747 301562 X62115 Resistance\n", - "GCF_001931595.1 aph(4)-Ia hygromicin 100.0 100.0 1026/1026 ref|NZ_CP016411.1| 291664 290639 V01499 Resistance\n", - "GCF_001931595.1 blaCTX-M-65 ampicillin, ceftriaxone 100.0 100.0 876/876 ref|NZ_CP016411.1| 276137 277012 GQ456158 Resistance\n", - "GCF_001931595.1 dfrA14 trimethoprim 99.79 100.0 
483/483 ref|NZ_CP016411.1| 295753 296235 DQ388123 Resistance\n", - "GCF_001931595.1 floR chloramphenicol 98.19 99.92 1214/1215 ref|NZ_CP016411.1| 282827 284040 AF118107 Resistance\n", - "GCF_001931595.1 gyrA (D87Y) ciprofloxacin I/R, nalidixic acid 99.43 100.0 2637/2637 ref|NZ_CP016410.1| 1597907 1600543 Resistance\n", - "GCF_001931595.1 sul1 sulfisoxazole 100.0 100.0 840/840 ref|NZ_CP016411.1| 159156 159995 U12338 Resistance\n", - "GCF_001931595.1 tet(A) tetracycline 100.0 100.0 1200/1200 ref|NZ_CP016411.1| 172512 173711 AJ517790 Resistance\n" + "Isolate ID Data Data Type Predicted Phenotype %Identity %Overlap HSP Length/Total Length Contig Start End Accession\n", + "GCF_001478105.1 ST152 (senterica) MLST \n", + "GCF_001478105.1 IncI1 Plasmid 100.0 100.0 142/142 ref|NZ_JYVD01000056.1| 15896 16037 AP005147\n", + "GCF_001478105.1 IncX1 Plasmid 100.0 100.0 373/373 ref|NZ_JYVD01000049.1| 2546 2174 CP001123\n", + "GCF_001478105.1 blaCMY-2 Resistance ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone 100.0 100.0 1146/1146 ref|NZ_JYVD01000056.1| 25020 26165 X91840\n", + "GCF_001931595.1 ST32 (senterica) MLST \n", + "GCF_001931595.1 None Plasmid \n", + "GCF_001931595.1 aac(3)-IVa Resistance gentamicin 99.87 100.0 786/786 ref|NZ_CP016411.1| 292669 291885 X01385\n", + "GCF_001931595.1 aph(3')-Ia Resistance kanamycin 100.0 100.0 816/816 ref|NZ_CP016411.1| 300747 301562 X62115\n", + "GCF_001931595.1 aph(4)-Ia Resistance hygromicin 100.0 100.0 1026/1026 ref|NZ_CP016411.1| 291664 290639 V01499\n", + "GCF_001931595.1 blaCTX-M-65 Resistance ampicillin, ceftriaxone 100.0 100.0 876/876 ref|NZ_CP016411.1| 276137 277012 GQ456158\n", + "GCF_001931595.1 dfrA14 Resistance trimethoprim 99.79 100.0 483/483 ref|NZ_CP016411.1| 295753 296235 DQ388123\n", + "GCF_001931595.1 floR Resistance chloramphenicol 98.19 99.92 1214/1215 ref|NZ_CP016411.1| 282827 284040 AF118107\n", + "GCF_001931595.1 gyrA (D87Y) Resistance ciprofloxacin I/R, nalidixic acid 99.43 100.0 2637/2637 
ref|NZ_CP016410.1| 1597907 1600543 \n", + "GCF_001931595.1 sul1 Resistance sulfisoxazole 100.0 100.0 840/840 ref|NZ_CP016411.1| 159156 159995 U12338\n", + "GCF_001931595.1 tet(A) Resistance tetracycline 100.0 100.0 1200/1200 ref|NZ_CP016411.1| 172512 173711 AJ517790\n" ] } ], @@ -525,27 +577,29 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Isolate ID Gene/Plasmid %Identity Contig Data Type\n", - "GCF_001478105.1 IncI1 100.0 ref|NZ_JYVD01000056.1| Plasmid\n", - "GCF_001478105.1 IncX1 100.0 ref|NZ_JYVD01000049.1| Plasmid\n", - "GCF_001478105.1 blaCMY-2 100.0 ref|NZ_JYVD01000056.1| Resistance\n", - "GCF_001931595.1 None Plasmid\n", - "GCF_001931595.1 aac(3)-IVa 99.87 ref|NZ_CP016411.1| Resistance\n", - "GCF_001931595.1 aph(3')-Ia 100.0 ref|NZ_CP016411.1| Resistance\n", - "GCF_001931595.1 aph(4)-Ia 100.0 ref|NZ_CP016411.1| Resistance\n", - "GCF_001931595.1 blaCTX-M-65 100.0 ref|NZ_CP016411.1| Resistance\n", - "GCF_001931595.1 dfrA14 99.79 ref|NZ_CP016411.1| Resistance\n", - "GCF_001931595.1 floR 98.19 ref|NZ_CP016411.1| Resistance\n", - "GCF_001931595.1 gyrA (D87Y) 99.43 ref|NZ_CP016410.1| Resistance\n", - "GCF_001931595.1 sul1 100.0 ref|NZ_CP016411.1| Resistance\n", - "GCF_001931595.1 tet(A) 100.0 ref|NZ_CP016411.1| Resistance\n" + "Isolate ID Data Predicted Phenotype HSP Length/Total Length Accession\n", + "GCF_001478105.1 ST152 (senterica) \n", + "GCF_001478105.1 IncI1 142/142 AP005147\n", + "GCF_001478105.1 IncX1 373/373 CP001123\n", + "GCF_001478105.1 blaCMY-2 ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone 1146/1146 X91840\n", + "GCF_001931595.1 ST32 (senterica) \n", + "GCF_001931595.1 None \n", + "GCF_001931595.1 aac(3)-IVa gentamicin 786/786 X01385\n", + "GCF_001931595.1 aph(3')-Ia kanamycin 816/816 X62115\n", + "GCF_001931595.1 aph(4)-Ia hygromicin 1026/1026 V01499\n", + "GCF_001931595.1 blaCTX-M-65 ampicillin, ceftriaxone 
876/876 GQ456158\n", + "GCF_001931595.1 dfrA14 trimethoprim 483/483 DQ388123\n", + "GCF_001931595.1 floR chloramphenicol 1214/1215 AF118107\n", + "GCF_001931595.1 gyrA (D87Y) ciprofloxacin I/R, nalidixic acid 2637/2637 \n", + "GCF_001931595.1 sul1 sulfisoxazole 840/840 U12338\n", + "GCF_001931595.1 tet(A) tetracycline 1200/1200 AJ517790\n" ] } ], @@ -556,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -564,10 +618,10 @@ "output_type": "stream", "text": [ "command_line = /home/CSCScience.ca/jtran/miniconda3/envs/staramr_dev/bin/staramr search --pointfinder-organism salmonella -o out GCF_001478105.1.fasta GCF_001931595.1.fasta\n", - "version = 0.5.1\n", - "start_time = 2019-05-09 08:59:59\n", - "end_time = 2019-05-09 09:00:02\n", - "total_minutes = 0.05\n", + "version = 0.6.0\n", + "start_time = 2019-08-28 12:27:44\n", + "end_time = 2019-08-28 12:27:48\n", + "total_minutes = 0.07\n", "resfinder_db_dir = /home/CSCScience.ca/jtran/Projects/staramr/staramr/databases/data/dist/resfinder\n", "resfinder_db_url = https://bitbucket.org/genomicepidemiology/resfinder_db.git\n", "resfinder_db_commit = e8f1eb2585cd9610c4034a54ce7fc4f93aa95535\n", @@ -580,6 +634,7 @@ "plasmidfinder_db_url = https://bitbucket.org/genomicepidemiology/plasmidfinder_db.git\n", "plasmidfinder_db_commit = 81919954cbedaff39056610ab584ab4c06011ed8\n", "plasmidfinder_db_date = Tue, 20 Nov 2018 08:51\n", + "mlst_version = 2.16.1\n", "pointfinder_gene_drug_version = 050218\n", "resfinder_gene_drug_version = 050218.1\n" ] @@ -598,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -623,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -658,7 +713,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -678,7 +733,7 @@ }, { "cell_type": 
"code", - "execution_count": 24, + "execution_count": 29, "metadata": {}, "outputs": [ { diff --git a/images/search_command.png b/images/search_command.png index 53ab10f0..5befa85a 100644 Binary files a/images/search_command.png and b/images/search_command.png differ diff --git a/images/settings_example.png b/images/settings_example.png index 7e33b2ad..176f0fbf 100644 Binary files a/images/settings_example.png and b/images/settings_example.png differ diff --git a/staramr/blast/JobHandler.py b/staramr/blast/JobHandler.py index 92ae6e2f..00915062 100644 --- a/staramr/blast/JobHandler.py +++ b/staramr/blast/JobHandler.py @@ -48,6 +48,7 @@ def __init__(self, blast_database_objects_map: Dict[str, AbstractBlastDatabase], raise Exception("threads is None") self._threads = threads + self._mlst_version = None if output_directory is None: raise Exception("output_directory is None") @@ -65,6 +66,7 @@ def __init__(self, blast_database_objects_map: Dict[str, AbstractBlastDatabase], self._thread_pool_executor = ThreadPoolExecutor(max_workers=self._threads) self._max_mlst_columns = 10 + self.reset() def reset(self): @@ -185,7 +187,7 @@ def _get_blast_map(self, name: str) -> Dict: def _get_mlst_data(self) -> str: - return self._mlst_data; + return self._mlst_data def _get_future_blasts_from_map(self, name: str) -> Dict: if name not in self._future_blasts_map: @@ -193,6 +195,24 @@ def _get_future_blasts_from_map(self, name: str) -> Dict: return self._future_blasts_map[name] + def _get_mlst_version(self) -> str: + command = ['mlst', '--version'] + + try: + output = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + + mlst_version = str(output.stdout, 'utf-8') + + # Parses out the mlst when the string is given back ex `mlst 2.x.x` and removes new line + mlst_version = (mlst_version[5:]).rstrip() + + except subprocess.CalledProcessError as e: + err_msg = str(e.stderr.strip()) + + raise Exception('Could not run mlst, error {}'.format(err_msg)) + + 
return mlst_version + def is_pointfinder_configured(self) -> bool: """ Whether or not PointFinder is being used. @@ -232,6 +252,10 @@ def get_mlst_outputs(self) -> str: return self._get_mlst_data(); + def get_mlst_version(self) -> str: + + return self._get_mlst_version(self) # type: ignore + def get_pointfinder_outputs(self) -> Dict: """ Gets the PointFinder output files in the form of a dictionary which looks like: diff --git a/staramr/subcommand/Database.py b/staramr/subcommand/Database.py index f649129b..18da93ff 100644 --- a/staramr/subcommand/Database.py +++ b/staramr/subcommand/Database.py @@ -8,6 +8,7 @@ from staramr.SubCommand import SubCommand from staramr.Utils import get_string_with_spacing +from staramr.blast.JobHandler import JobHandler from staramr.databases.AMRDatabasesManager import AMRDatabasesManager from staramr.databases.resistance.ARGDrugTable import ARGDrugTable from staramr.exceptions.CommandParseException import CommandParseException @@ -294,8 +295,11 @@ def run(self, args): try: database_info = database_repos.info() + database_info['mlst_version'] = JobHandler.get_mlst_version(JobHandler) + database_info.update(arg_drug_table.get_resistance_table_info()) sys.stdout.write(get_string_with_spacing(database_info)) + except DatabaseNotFoundException as e: logger.error("No database found. Perhaps try restoring the default with 'staramr db restore-default'") else: diff --git a/staramr/subcommand/Search.py b/staramr/subcommand/Search.py index a287f927..f47403f4 100644 --- a/staramr/subcommand/Search.py +++ b/staramr/subcommand/Search.py @@ -268,6 +268,8 @@ def _generate_results(self, database_repos, resfinder_database, pointfinder_data logger.info("Finished. Took %s minutes.", time_difference_minutes) settings = database_repos.info() + + settings['mlst_version'] = JobHandler.get_mlst_version(JobHandler) settings['command_line'] = ' '.join(sys.argv) settings['version'] = self._version settings['start_time'] = start_time.strftime(self.TIME_FORMAT)