Merge pull request #96 from phac-nml/development

Release 0.6.0
phac-nml · Sep 12, 2019 · 741c06c · 741c06c
2 parents 6f6f65d + a6e7f95
commit 741c06c
Show file tree

Hide file tree

Showing 28 changed files with 918 additions and 211 deletions.
diff --git a/.mypy.ini b/.mypy.ini
@@ -13,3 +13,6 @@ ignore_missing_imports = True
 
 [mypy-numpy.*]
 ignore_missing_imports = True
+
+[mypy-coloredlogs.*]
+ignore_missing_imports = True
diff --git a/.travis.yml b/.travis.yml
@@ -16,7 +16,7 @@ install:
   - hash -r
   - conda update -q -y conda
   - conda info -a
-  - conda create -c bioconda -c conda-forge -q -y -n test-environment python=$TRAVIS_PYTHON_VERSION blast=2.7.1 git
+  - conda create -c bioconda -c conda-forge -q -y -n test-environment python=$TRAVIS_PYTHON_VERSION blast=2.7.1 git mlst
   - source activate test-environment
   - python setup.py install
   - staramr db build --dir staramr/databases/data $DATABASE_COMMITS

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+# Version 0.6.0
+
+* Added the [coloredlogs](https://pypi.org/project/coloredlogs/) library to colorize and format the log output
+* Added support for [MLST](https://github.com/tseemann/mlst)
+
 # Version 0.5.1
 
 * Renamed the following columns for clarification:

diff --git a/README.md b/README.md
diff --git a/doc/tutorial/staramr-tutorial.ipynb b/doc/tutorial/staramr-tutorial.ipynb
diff --git a/images/search_command.png b/images/search_command.png
diff --git a/images/settings_example.png b/images/settings_example.png
diff --git a/setup.py b/setup.py
@@ -18,7 +18,7 @@
 
 setup(name='staramr',
       version=__version__,
-      description='Scans genome contigs against ResFinder and PointFinder databases',
+      description='Scans genome contigs against ResFinder, PlasmidFinder, and PointFinder databases',
       author='Aaron Petkau',
       author_email='[email protected]',
       url='https://github.com/phac-nml/staramr',
@@ -30,7 +30,8 @@
           'GitPython>=2.1.3',
           'xlsxwriter>=1.0.2',
           'numpy>=1.12.1',
-          'green>=2.13.0'
+          'green>=2.13.0',
+          'coloredlogs>=10.0'
       ],
       packages=find_packages(),
       include_package_data=True,

diff --git a/staramr/SubCommand.py b/staramr/SubCommand.py
@@ -1,5 +1,6 @@
 import abc
 import logging
+import coloredlogs
 
 """
 Abstract class for any sub-commands for the command-line application.
@@ -34,7 +35,6 @@ def run(self, args):
         :return: None
         """
         if args.verbose:
-            logging.basicConfig(level=logging.DEBUG,
-                                format='%(asctime)s %(levelname)s %(module)s.%(funcName)s,%(lineno)s: %(message)s')
+            coloredlogs.install(level='DEBUG', fmt='%(asctime)s %(levelname)s %(name)s.%(funcName)s,%(lineno)s: %(message)s')
         else:
-            logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
+            coloredlogs.install(level='INFO', fmt='%(asctime)s %(levelname)s: %(message)s')
diff --git a/staramr/__init__.py b/staramr/__init__.py
@@ -1 +1 @@
-__version__ = '0.5.1'
+__version__ = '0.6.0'
diff --git a/staramr/blast/BlastHandler.py → staramr/blast/JobHandler.py b/staramr/blast/BlastHandler.py → staramr/blast/JobHandler.py
@@ -2,23 +2,24 @@
 import os
 import re
 import subprocess
+import math
 from concurrent.futures import ThreadPoolExecutor
 from os import path
-from typing import Dict
+from typing import Dict, List
 
 from Bio.Blast.Applications import NcbiblastnCommandline
 
 from staramr.blast.AbstractBlastDatabase import AbstractBlastDatabase
 from staramr.exceptions.BlastProcessError import BlastProcessError
 
-logger = logging.getLogger('BlastHandler')
+logger = logging.getLogger('JobHandler')
 
 """
 Class for handling scheduling of BLAST jobs.
 """
 
 
-class BlastHandler:
+class JobHandler:
     BLAST_COLUMNS = [x.strip() for x in '''
     qseqid
     sseqid
@@ -38,7 +39,7 @@ class BlastHandler:
     def __init__(self, blast_database_objects_map: Dict[str, AbstractBlastDatabase], threads: int,
                  output_directory: str) -> None:
         """
-        Creates a new BlastHandler.
+        Creates a new JobHandler.
         :param blast_database_objects_map: A map containing the blast databases.
         :param threads: The maximum number of threads to use, where one BLAST process gets assigned to one thread.
         :param output_directory: The output directory to store BLAST results.
@@ -47,6 +48,7 @@ def __init__(self, blast_database_objects_map: Dict[str, AbstractBlastDatabase],
             raise Exception("threads is None")
 
         self._threads = threads
+        self._mlst_version = None
 
         if output_directory is None:
             raise Exception("output_directory is None")
@@ -62,41 +64,60 @@ def __init__(self, blast_database_objects_map: Dict[str, AbstractBlastDatabase],
         else:
             self._pointfinder_configured = True  # type: bool
 
-        self._thread_pool_executor = None
+        self._thread_pool_executor = ThreadPoolExecutor(max_workers=self._threads)
+        self._max_mlst_columns = 10
+
         self.reset()
 
     def reset(self):
         """
-        Resets this BlastHandler.
+        Resets this JobHandler.
         :return: None
         """
         if self._thread_pool_executor is not None:
             self._thread_pool_executor.shutdown()
         self._thread_pool_executor = ThreadPoolExecutor(max_workers=self._threads)
         self._blast_map = {}
         self._future_blasts_map = {}
+        self._mlst_data = ""
+        self._mlst_map = {}
 
         if path.exists(self._input_genomes_tmp_dir):
             logger.debug("Directory [%s] already exists", self._input_genomes_tmp_dir)
         else:
             os.mkdir(self._input_genomes_tmp_dir)
 
-    def run_blasts(self, files) -> None:
+    def run_blasts_mlst(self, files, mlst_scheme) -> None:
         """
-        Scans all files with BLAST against the ResFinder/PointFinder/Plasmid databases.
+        Scans all files with BLAST against the ResFinder/PointFinder/Plasmid databases and scans all files with MLST
         :param files: The files to scan.
+        :param mlst_scheme: Specifies scheme name for MLST to use.
         :return: None
         """
         db_files = self._make_db_from_input_files(self._input_genomes_tmp_dir, files)
         logger.debug("Done making blast databases for input files")
 
+        future_mlst_db = [] # type: list
+
         for file in db_files:
-            logger.info("Scheduling blasts for %s", path.basename(file))
+
+            logger.info("Scheduling blasts and MLST for %s", path.basename(file))
+            future_mlst_db.append(self._thread_pool_executor.submit(self._schedule_mlst, file, mlst_scheme))
 
             for name in self._blast_database_objects_map:
                 database_object = self._blast_database_objects_map[name]
                 self._schedule_blast(file, database_object)
 
+        try:
+            for future_mlst in future_mlst_db:
+                mlst_result = future_mlst.result()
+
+                self._mlst_data += mlst_result
+
+        except subprocess.CalledProcessError as e:
+            err_msg = str(e.stderr.strip())
+            raise Exception('Could not run mlst, error {}'.format(err_msg))
+
     def _make_db_from_input_files(self, db_dir, files):
         logger.info("Making BLAST databases for input files")
         future_makeblastdbs = []
@@ -119,6 +140,28 @@ def _make_db_from_input_files(self, db_dir, files):
 
         return db_files
 
+    def _schedule_mlst(self, file: str, mlst_scheme: str) -> str:
+        """
+        Runs the `mlst` command-line tool on a single file and returns what it printed.
+        :param file: The path of the file to pass to `mlst`.
+        :param mlst_scheme: The scheme name passed via `--scheme`, or None to omit the option.
+        :return: The standard output of `mlst`, decoded as UTF-8.
+        :raises Exception: If `mlst` exits with a non-zero status.
+        """
+        command = ['mlst']
+
+        if mlst_scheme is not None:
+            command.extend(['--scheme', mlst_scheme])
+
+        command.append(file)
+
+        logger.debug(' '.join(command))
+        try:
+            output = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+        except subprocess.CalledProcessError as e:
+            # stderr is captured as bytes; decode it so the message is not rendered as "b'...'"
+            err_msg = e.stderr.decode('utf-8', errors='replace').strip()
+            raise Exception('Could not run mlst, error {}'.format(err_msg)) from e
+
+        return str(output.stdout, 'utf-8')
+
     def _schedule_blast(self, file, blast_database):
         database_names = blast_database.get_database_names()
         logger.debug("%s databases: %s", blast_database.get_name(), database_names)
@@ -142,12 +185,34 @@ def _get_blast_map(self, name: str) -> Dict:
 
         return self._blast_map[name]
 
+    def _get_mlst_data(self) -> str:
+        """
+        Gets the raw MLST output stored on this handler.
+        :return: The current value of the internal MLST data string.
+        """
+        collected_output = self._mlst_data
+        return collected_output
+
     def _get_future_blasts_from_map(self, name: str) -> Dict:
         if name not in self._future_blasts_map:
             self._future_blasts_map[name] = []
 
         return self._future_blasts_map[name]
 
+    def _get_mlst_version(self) -> str:
+        """
+        Asks the `mlst` command-line tool for its version by running `mlst --version`.
+        :return: The printed version with the leading program name and surrounding whitespace removed.
+        :raises Exception: If `mlst` exits with a non-zero status.
+        """
+        command = ['mlst', '--version']
+
+        try:
+            output = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+        except subprocess.CalledProcessError as e:
+            # stderr is captured as bytes; decode it so the message is not rendered as "b'...'"
+            err_msg = e.stderr.decode('utf-8', errors='replace').strip()
+            raise Exception('Could not run mlst, error {}'.format(err_msg)) from e
+
+        mlst_version = str(output.stdout, 'utf-8').strip()
+
+        # The tool prints e.g. `mlst 2.x.x`; drop the program name only when it is really there,
+        # instead of blindly cutting the first five characters.
+        prefix = 'mlst '
+        if mlst_version.startswith(prefix):
+            mlst_version = mlst_version[len(prefix):]
+
+        return mlst_version
+
     def is_pointfinder_configured(self) -> bool:
         """
         Whether or not PointFinder is being used.
@@ -179,6 +244,18 @@ def get_plasmidfinder_outputs(self) -> Dict:
             future_blast.result()
         return self._get_blast_map('plasmidfinder')
 
+    def get_mlst_outputs(self) -> str:
+        """
+        Gets the MLST output collected from the MLST subprocesses.
+        :return: The decoded `mlst` output accumulated so far, as a single string.
+        """
+        return self._get_mlst_data()
+
+    def get_mlst_version(self) -> str:
+        """
+        Gets the version of the `mlst` tool.
+        :return: The version string produced by `_get_mlst_version`.
+        """
+        # NOTE(review): the previous body forwarded `self` explicitly, which only works when this
+        # method is invoked on the class itself (e.g. `JobHandler.get_mlst_version(JobHandler)`).
+        # Presumably a caller does exactly that -- confirm -- so that form is kept working.
+        if isinstance(self, type):
+            return self._get_mlst_version(self)  # type: ignore
+
+        # Normal bound call on an instance: `self` is supplied implicitly.
+        return self._get_mlst_version()
+
     def get_pointfinder_outputs(self) -> Dict:
         """
         Gets the PointFinder output files in the form of a dictionary which looks like:

diff --git a/staramr/blast/results/BlastResultsParser.py b/staramr/blast/results/BlastResultsParser.py
@@ -7,7 +7,7 @@
 import numpy as np
 import pandas as pd
 
-from staramr.blast.BlastHandler import BlastHandler
+from staramr.blast.JobHandler import JobHandler
 from staramr.blast.results.BlastHitPartitions import BlastHitPartitions
 
 logger = logging.getLogger('BlastResultsParser')
@@ -86,7 +86,7 @@ def _get_out_file_name(self, in_file):
         pass
 
     def _handle_blast_hit(self, in_file, database_name, blast_file, results, hit_seq_records):
-        blast_table = pd.read_csv(blast_file, sep='\t', header=None, names=BlastHandler.BLAST_COLUMNS,
+        blast_table = pd.read_csv(blast_file, sep='\t', header=None, names=JobHandler.BLAST_COLUMNS,
                                   index_col=False).astype(
             dtype={'qseqid': np.unicode_, 'sseqid': np.unicode_})
         partitions = BlastHitPartitions()

diff --git a/staramr/blast/results/pointfinder/PointfinderHitHSP.py b/staramr/blast/results/pointfinder/PointfinderHitHSP.py
@@ -34,11 +34,22 @@ def _get_match_positions(self):
         return [i for i, (x, y) in enumerate(zip(amr_seq, genome_seq)) if x != y]
 
     def _get_mutation_positions(self, start):
+        mutation_positions_filtered = []
+        codon_starts = []
+
         amr_seq = self.get_amr_gene_seq()
         genome_seq = self.get_genome_contig_hsp_seq()
 
+        # Only return mutation position objects with unique codon start positions
+        mutation_positions = [CodonMutationPosition(i, amr_seq, genome_seq, start) for i in self._get_match_positions()]
+
+        for m in mutation_positions:
+            if m._codon_start not in codon_starts:
+                codon_starts.append(m._codon_start)
+                mutation_positions_filtered.append(m)
+
         # @formatter:off
-        return [CodonMutationPosition(i, amr_seq, genome_seq, start) for i in self._get_match_positions()]
+        return mutation_positions_filtered
         # @formatter:on
 
     def get_mutations(self):

diff --git a/staramr/blast/results/pointfinder/codon/CodonMutationPosition.py b/staramr/blast/results/pointfinder/codon/CodonMutationPosition.py
@@ -47,21 +47,21 @@ def get_database_amr_gene_codon(self):
 
     def get_database_amr_gene_amino_acid(self):
         """
-        Gets the corresponding amino acid from the amr gene. If there is an indel, returns 'X'.
+        Gets the corresponding amino acid from the amr gene. If there is an insertion, returns 'ins'.
         :return: The amino acid from the amr gene.
         """
         if '-' in self.get_database_amr_gene_codon():
-            return 'X'
+            return 'ins'
         else:
             return Bio.Seq.translate(self.get_database_amr_gene_codon(), table='Standard')
 
     def get_input_genome_amino_acid(self):
         """
-        Gets the corresponding amino acid from the genome.  If there is an indel returns 'X'.
+        Gets the corresponding amino acid from the genome.  If there is a deletion returns 'del'.
         :return: The amino acid from the genome.
         """
         if '-' in self.get_input_genome_codon():
-            return 'X'
+            return 'del'
         else:
             return Bio.Seq.translate(self.get_input_genome_codon(), table='Standard')
 
@@ -80,10 +80,17 @@ def get_mutation_string(self):
                + ' -> ' + self.get_input_genome_amino_acid() + ')'
 
     def get_database_amr_gene_mutation(self):
-        return self.get_database_amr_gene_amino_acid().upper()
+        if '-' in self.get_database_amr_gene_codon():
+            return self.get_database_amr_gene_amino_acid()
+        else:
+            return self.get_database_amr_gene_amino_acid().upper()
 
     def get_input_genome_mutation(self):
-        return self.get_input_genome_amino_acid().upper()
+        # Keep 'ins' or 'del' lowercase 
+        if '-' in self.get_input_genome_codon():
+            return self.get_input_genome_amino_acid()
+        else:
+            return self.get_input_genome_amino_acid().upper()
 
     def get_type(self):
         return 'codon'