Skip to content

Commit

Permalink
Merge pull request #96 from phac-nml/development
Browse files Browse the repository at this point in the history
Release 0.6.0
  • Loading branch information
apetkau authored Sep 12, 2019
2 parents 6f6f65d + a6e7f95 commit 741c06c
Show file tree
Hide file tree
Showing 28 changed files with 918 additions and 211 deletions.
3 changes: 3 additions & 0 deletions .mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ ignore_missing_imports = True

[mypy-numpy.*]
ignore_missing_imports = True

[mypy-coloredlogs.*]
ignore_missing_imports = True
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ install:
- hash -r
- conda update -q -y conda
- conda info -a
- conda create -c bioconda -c conda-forge -q -y -n test-environment python=$TRAVIS_PYTHON_VERSION blast=2.7.1 git
- conda create -c bioconda -c conda-forge -q -y -n test-environment python=$TRAVIS_PYTHON_VERSION blast=2.7.1 git mlst
- source activate test-environment
- python setup.py install
- staramr db build --dir staramr/databases/data $DATABASE_COMMITS
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Version 0.6.0

* Added [coloredlogs](https://pypi.org/project/coloredlogs/) library to format the output
* Added support for [MLST](https://github.com/tseemann/mlst)

# Version 0.5.1

* Renamed the following columns for clarification:
Expand Down
109 changes: 78 additions & 31 deletions README.md

Large diffs are not rendered by default.

207 changes: 131 additions & 76 deletions doc/tutorial/staramr-tutorial.ipynb

Large diffs are not rendered by default.

Binary file modified images/search_command.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified images/settings_example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

setup(name='staramr',
version=__version__,
description='Scans genome contigs against ResFinder and PointFinder databases',
description='Scans genome contigs against ResFinder, PlasmidFinder, and PointFinder databases',
author='Aaron Petkau',
author_email='[email protected]',
url='https://github.com/phac-nml/staramr',
Expand All @@ -30,7 +30,8 @@
'GitPython>=2.1.3',
'xlsxwriter>=1.0.2',
'numpy>=1.12.1',
'green>=2.13.0'
'green>=2.13.0',
'coloredlogs>=10.0'
],
packages=find_packages(),
include_package_data=True,
Expand Down
6 changes: 3 additions & 3 deletions staramr/SubCommand.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import abc
import logging
import coloredlogs

"""
Abstract class for any sub-commands for the command-line application.
Expand Down Expand Up @@ -34,7 +35,6 @@ def run(self, args):
:return: None
"""
if args.verbose:
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(module)s.%(funcName)s,%(lineno)s: %(message)s')
coloredlogs.install(level='DEBUG', fmt='%(asctime)s %(levelname)s %(name)s.%(funcName)s,%(lineno)s: %(message)s')
else:
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
coloredlogs.install(level='INFO', fmt='%(asctime)s %(levelname)s: %(message)s')
2 changes: 1 addition & 1 deletion staramr/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.5.1'
__version__ = '0.6.0'
95 changes: 86 additions & 9 deletions staramr/blast/BlastHandler.py → staramr/blast/JobHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,24 @@
import os
import re
import subprocess
import math
from concurrent.futures import ThreadPoolExecutor
from os import path
from typing import Dict
from typing import Dict, List

from Bio.Blast.Applications import NcbiblastnCommandline

from staramr.blast.AbstractBlastDatabase import AbstractBlastDatabase
from staramr.exceptions.BlastProcessError import BlastProcessError

logger = logging.getLogger('BlastHandler')
logger = logging.getLogger('JobHandler')

"""
Class for handling scheduling of BLAST and MLST jobs.
"""


class BlastHandler:
class JobHandler:
BLAST_COLUMNS = [x.strip() for x in '''
qseqid
sseqid
Expand All @@ -38,7 +39,7 @@ class BlastHandler:
def __init__(self, blast_database_objects_map: Dict[str, AbstractBlastDatabase], threads: int,
output_directory: str) -> None:
"""
Creates a new BlastHandler.
Creates a new JobHandler.
:param blast_database_objects_map: A map containing the blast databases.
:param threads: The maximum number of threads to use, where one BLAST process gets assigned to one thread.
:param output_directory: The output directory to store BLAST results.
Expand All @@ -47,6 +48,7 @@ def __init__(self, blast_database_objects_map: Dict[str, AbstractBlastDatabase],
raise Exception("threads is None")

self._threads = threads
self._mlst_version = None

if output_directory is None:
raise Exception("output_directory is None")
Expand All @@ -62,41 +64,60 @@ def __init__(self, blast_database_objects_map: Dict[str, AbstractBlastDatabase],
else:
self._pointfinder_configured = True # type: bool

self._thread_pool_executor = None
self._thread_pool_executor = ThreadPoolExecutor(max_workers=self._threads)
self._max_mlst_columns = 10

self.reset()

def reset(self):
"""
Resets this BlastHandler.
Resets this JobHandler.
:return: None
"""
if self._thread_pool_executor is not None:
self._thread_pool_executor.shutdown()
self._thread_pool_executor = ThreadPoolExecutor(max_workers=self._threads)
self._blast_map = {}
self._future_blasts_map = {}
self._mlst_data = ""
self._mlst_map = {}

if path.exists(self._input_genomes_tmp_dir):
logger.debug("Directory [%s] already exists", self._input_genomes_tmp_dir)
else:
os.mkdir(self._input_genomes_tmp_dir)

def run_blasts(self, files) -> None:
def run_blasts_mlst(self, files, mlst_scheme) -> None:
"""
Scans all files with BLAST against the ResFinder/PointFinder/Plasmid databases.
Scans all files with BLAST against the ResFinder/PointFinder/Plasmid databases and scans all files with MLST
:param files: The files to scan.
:param mlst_scheme: Specifies scheme name for MLST to use.
:return: None
"""
db_files = self._make_db_from_input_files(self._input_genomes_tmp_dir, files)
logger.debug("Done making blast databases for input files")

future_mlst_db = [] # type: list

for file in db_files:
logger.info("Scheduling blasts for %s", path.basename(file))

logger.info("Scheduling blasts and MLST for %s", path.basename(file))
future_mlst_db.append(self._thread_pool_executor.submit(self._schedule_mlst, file, mlst_scheme))

for name in self._blast_database_objects_map:
database_object = self._blast_database_objects_map[name]
self._schedule_blast(file, database_object)

try:
for future_mlst in future_mlst_db:
mlst_result = future_mlst.result()

self._mlst_data += mlst_result

except subprocess.CalledProcessError as e:
err_msg = str(e.stderr.strip())
raise Exception('Could not run mlst, error {}'.format(err_msg))

def _make_db_from_input_files(self, db_dir, files):
logger.info("Making BLAST databases for input files")
future_makeblastdbs = []
Expand All @@ -119,6 +140,28 @@ def _make_db_from_input_files(self, db_dir, files):

return db_files

def _schedule_mlst(self, file: str, mlst_scheme: str) -> str:
    """
    Runs the `mlst` program on a single file and returns what it printed.
    :param file: The file to pass to `mlst` as its final argument.
    :param mlst_scheme: The scheme name passed with `--scheme`, or None to leave the option out.
    :return: The standard output of `mlst`, decoded as UTF-8.
    :raises Exception: If `mlst` exits with a non-zero status; the message contains its standard error text.
    """
    command = ['mlst']

    if mlst_scheme is not None:
        command.extend(['--scheme', mlst_scheme])

    command.append(file)

    logger.debug(' '.join(command))
    try:
        output = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError as e:
        # stderr is captured as bytes: decode it so the message carries the text itself
        # instead of a "b'...'" representation.
        err_msg = e.stderr.decode('utf-8', errors='replace').strip()
        raise Exception('Could not run mlst, error {}'.format(err_msg)) from e

    return str(output.stdout, 'utf-8')

def _schedule_blast(self, file, blast_database):
database_names = blast_database.get_database_names()
logger.debug("%s databases: %s", blast_database.get_name(), database_names)
Expand All @@ -142,12 +185,34 @@ def _get_blast_map(self, name: str) -> Dict:

return self._blast_map[name]

def _get_mlst_data(self) -> str:

return self._mlst_data

def _get_future_blasts_from_map(self, name: str) -> Dict:
if name not in self._future_blasts_map:
self._future_blasts_map[name] = []

return self._future_blasts_map[name]

def _get_mlst_version(self) -> str:
    """
    Asks the `mlst` program for its version by running `mlst --version`.
    :return: The version string, without the leading `mlst ` program name and without surrounding whitespace.
    :raises Exception: If `mlst --version` exits with a non-zero status; the message contains its standard error text.
    """
    command = ['mlst', '--version']

    try:
        output = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError as e:
        # stderr is captured as bytes: decode it so the message carries the text itself
        # instead of a "b'...'" representation.
        err_msg = e.stderr.decode('utf-8', errors='replace').strip()
        raise Exception('Could not run mlst, error {}'.format(err_msg)) from e

    mlst_version = str(output.stdout, 'utf-8').strip()

    # The string given back looks like `mlst 2.x.x`. Only drop the program name when it is
    # really there, rather than blindly cutting off the first five characters.
    program_prefix = 'mlst '
    if mlst_version.startswith(program_prefix):
        mlst_version = mlst_version[len(program_prefix):]

    return mlst_version

def is_pointfinder_configured(self) -> bool:
"""
Whether or not PointFinder is being used.
Expand Down Expand Up @@ -179,6 +244,18 @@ def get_plasmidfinder_outputs(self) -> Dict:
future_blast.result()
return self._get_blast_map('plasmidfinder')

def get_mlst_outputs(self) -> str:
    """
    Gets the MLST output collected by this object.
    :return: The MLST data as a single string, as returned by `_get_mlst_data()`.
    """
    mlst_outputs = self._get_mlst_data()
    return mlst_outputs

def get_mlst_version(self) -> str:
    """
    Gets the version of the `mlst` program.
    :return: The version string as returned by `_get_mlst_version()`.
    """
    # `_get_mlst_version` is a bound method taking no arguments besides `self`;
    # passing `self` explicitly a second time raised a TypeError.
    return self._get_mlst_version()

def get_pointfinder_outputs(self) -> Dict:
"""
Gets the PointFinder output files in the form of a dictionary which looks like:
Expand Down
4 changes: 2 additions & 2 deletions staramr/blast/results/BlastResultsParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numpy as np
import pandas as pd

from staramr.blast.BlastHandler import BlastHandler
from staramr.blast.JobHandler import JobHandler
from staramr.blast.results.BlastHitPartitions import BlastHitPartitions

logger = logging.getLogger('BlastResultsParser')
Expand Down Expand Up @@ -86,7 +86,7 @@ def _get_out_file_name(self, in_file):
pass

def _handle_blast_hit(self, in_file, database_name, blast_file, results, hit_seq_records):
blast_table = pd.read_csv(blast_file, sep='\t', header=None, names=BlastHandler.BLAST_COLUMNS,
blast_table = pd.read_csv(blast_file, sep='\t', header=None, names=JobHandler.BLAST_COLUMNS,
index_col=False).astype(
dtype={'qseqid': np.unicode_, 'sseqid': np.unicode_})
partitions = BlastHitPartitions()
Expand Down
13 changes: 12 additions & 1 deletion staramr/blast/results/pointfinder/PointfinderHitHSP.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,22 @@ def _get_match_positions(self):
return [i for i, (x, y) in enumerate(zip(amr_seq, genome_seq)) if x != y]

def _get_mutation_positions(self, start):
mutation_positions_filtered = []
codon_starts = []

amr_seq = self.get_amr_gene_seq()
genome_seq = self.get_genome_contig_hsp_seq()

# Only return mutation position objects with unique codon start positions
mutation_positions = [CodonMutationPosition(i, amr_seq, genome_seq, start) for i in self._get_match_positions()]

for m in mutation_positions:
if m._codon_start not in codon_starts:
codon_starts.append(m._codon_start)
mutation_positions_filtered.append(m)

# @formatter:off
return [CodonMutationPosition(i, amr_seq, genome_seq, start) for i in self._get_match_positions()]
return mutation_positions_filtered
# @formatter:on

def get_mutations(self):
Expand Down
19 changes: 13 additions & 6 deletions staramr/blast/results/pointfinder/codon/CodonMutationPosition.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,21 +47,21 @@ def get_database_amr_gene_codon(self):

def get_database_amr_gene_amino_acid(self):
"""
Gets the corresponding amino acid from the amr gene. If there is an indel, returns 'X'.
Gets the corresponding amino acid from the amr gene. If there is an insertion, returns 'ins'.
:return: The amino acid from the amr gene.
"""
if '-' in self.get_database_amr_gene_codon():
return 'X'
return 'ins'
else:
return Bio.Seq.translate(self.get_database_amr_gene_codon(), table='Standard')

def get_input_genome_amino_acid(self):
"""
Gets the corresponding amino acid from the genome. If there is an indel returns 'X'.
Gets the corresponding amino acid from the genome. If there is a deletion returns 'del'.
:return: The amino acid from the genome.
"""
if '-' in self.get_input_genome_codon():
return 'X'
return 'del'
else:
return Bio.Seq.translate(self.get_input_genome_codon(), table='Standard')

Expand All @@ -80,10 +80,17 @@ def get_mutation_string(self):
+ ' -> ' + self.get_input_genome_amino_acid() + ')'

def get_database_amr_gene_mutation(self):
return self.get_database_amr_gene_amino_acid().upper()
if '-' in self.get_database_amr_gene_codon():
return self.get_database_amr_gene_amino_acid()
else:
return self.get_database_amr_gene_amino_acid().upper()

def get_input_genome_mutation(self):
return self.get_input_genome_amino_acid().upper()
# Keep 'ins' or 'del' lowercase
if '-' in self.get_input_genome_codon():
return self.get_input_genome_amino_acid()
else:
return self.get_input_genome_amino_acid().upper()

def get_type(self):
    """
    Gets the type of this mutation position.
    :return: The string 'codon'.
    """
    mutation_type = 'codon'
    return mutation_type
Expand Down
Loading

0 comments on commit 741c06c

Please sign in to comment.