Tests #95

Closed
wants to merge 2 commits into from
140 changes: 140 additions & 0 deletions bin/primers_to_amplicons.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python
'''Script to create amplicon bed file from primer bed file with minimal dependencies'''

import argparse
import csv
import re
from collections import defaultdict
from typing import Generator

def init_parser() -> argparse.ArgumentParser:
"""
Specify command line arguments
Returns command line parser with inputs
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'-b',
'--bed',
required=True,
type=str,
help='Path to input primer bed file'
)
parser.add_argument(
'-o',
'--outfile',
required=False,
default='amplicon.bed',
type=str,
help='Name for output amplicon bed file. Default: "amplicon.bed"'
)
return parser

def primer_pair_generator(bed: str, primer_rgx: str = r'^(.*)_(LEFT|RIGHT).*$') -> Generator[tuple, None, None]:
    '''
    Yield (primer_prefix, left_primer_row, right_primer_row) tuples from the input bed file
    Primers are paired through primer_rgx, which requires the names to contain:
        _LEFT or _RIGHT
    '''
pair_dict = defaultdict(lambda: [None, None])
with open(bed, 'r') as handle:
reader = csv.reader(handle, delimiter='\t')
for row in reader:
            # Bed file needs at least 5 columns (chrom, start, stop, name, pool)
if len(row) < 5:
continue
name = str(row[3])

# Pair based on if LEFT or RIGHT match and yield upon a pair
primer_match = re.search(primer_rgx, name)
if primer_match:
if primer_match.group(2) == 'LEFT':
primer_prefix = primer_match.group(1)
if primer_prefix not in pair_dict:
pair_dict[primer_prefix][0] = row
                    # In case of alt primers, don't want to yield yet
                    elif pair_dict[primer_prefix][0] is not None:
                        print(f'Skipping {name} - already has a LEFT primer in dict')
                        continue
                    else:
                        # RIGHT primer was stored first; yield (prefix, left_row, right_row)
                        yield primer_prefix, row, pair_dict[primer_prefix][1]
                        del pair_dict[primer_prefix]
elif primer_match.group(2) == 'RIGHT':
primer_prefix = primer_match.group(1)
if primer_prefix not in pair_dict:
pair_dict[primer_prefix][1] = row
# In case of alt primers, don't want to yield yet
                    elif pair_dict[primer_prefix][1] is not None:
print(f'Skipping {name} - already has a primer in dict')
continue
else:
yield primer_prefix, pair_dict[primer_prefix][0], row
del pair_dict[primer_prefix]

def find_primer_prefix(primer_names: list) -> str:
    '''
    Return the longest common prefix shared by all primer names,
    trimming a trailing '_' (e.g. ['scheme_1', 'scheme_2'] -> 'scheme')
    '''
prefix = ""

    # Iterate through characters (bounded by the shortest name) and check that they all match
    for i in range(min(len(s) for s in primer_names)):
if all(s[i] == primer_names[0][i] for s in primer_names):
prefix += primer_names[0][i]
else:
break

# Don't end with an '_'
if prefix.endswith('_'):
prefix = prefix[0:-1]

return prefix


def main() -> None:
'''Run the program'''
# Init Parser and set arguments
parser = init_parser()
args = parser.parse_args()

# Use variables to find the starting and end position of the scheme
scheme_start = 1000000000
scheme_end = 0
primer_names = []

# Write amplicon bed file
with open(args.outfile, 'w') as f:
for ppair in primer_pair_generator(args.bed):
            # ppair = (primer_name, left_primer_row, right_primer_row)
            # Check starts and stops to make sure that start < stop; if not, swap them
start_l, stop_l, start_r, stop_r = int(ppair[1][1]), int(ppair[1][2]), int(ppair[2][1]), int(ppair[2][2])
if start_l > stop_l:
start_l, stop_l = stop_l, start_l
if start_r > stop_r:
start_r, stop_r = stop_r, start_r

            # Track the overall tiled region (first left-primer end to last right-primer start)
if scheme_start > stop_l:
scheme_start = stop_l
if scheme_end < start_r:
scheme_end = start_r

# Track primer names
primer_names.append(ppair[0])

# Write output
chrom = ppair[1][0]
pname = ppair[0]
pool = ppair[1][4]
f.write(f'{chrom}\t{stop_l}\t{start_r}\t{pname}\t{pool}\t+\n')

# Write the region tiled to file in case it is needed
    with open('tiling_region.bed', 'w') as f:
f.write(f'{chrom}\t{scheme_start}\t{scheme_end}\n')

# Get prefix for the primers in case it is needed
primer_prefix = find_primer_prefix(primer_names)
    with open('primer_prefix.txt', 'w') as f:
f.write(primer_prefix)

if __name__ == '__main__':
main()
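
A minimal usage sketch of the pairing generator (illustrative only: the bed rows, coordinates, and temporary-file handling below are invented for this example and are not part of the PR; it also assumes bin/ is importable as a module):

# illustrate_pairing.py -- hypothetical example, not part of this PR
import tempfile

from primers_to_amplicons import primer_pair_generator  # assumes bin/ is on PYTHONPATH

# Toy primer bed rows: chrom, start, stop, name, pool (values invented)
rows = [
    "MN908947.3\t30\t54\tscheme_1_LEFT\t1",
    "MN908947.3\t1183\t1205\tscheme_1_RIGHT\t1",
    "MN908947.3\t1100\t1128\tscheme_2_LEFT\t2",
    "MN908947.3\t2244\t2266\tscheme_2_RIGHT\t2",
]
with tempfile.NamedTemporaryFile('w', suffix='.bed', delete=False) as tmp:
    tmp.write('\n'.join(rows) + '\n')

for prefix, left, right in primer_pair_generator(tmp.name):
    # Each amplicon spans from the end of the left primer to the start of the right primer
    print(prefix, left[2], right[1])  # scheme_1 54 1183 / scheme_2 1128 2244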
Empty file modified bin/run_ncovtools.sh
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion extra_data/config.yaml
@@ -39,7 +39,7 @@ variants_pattern: "{data_root}/{sample}.pass.vcf.gz"
# in the tree by providing a fasta file here
#tree_include_consensus: some_genomes_from_gisaid.fasta

# some plots can by annotated with external metadata. this
# some plots can be annotated with external metadata. This
# file contains the metadata in simple tab-delimited format
# one column must be 'sample'
metadata: metadata.tsv
86 changes: 86 additions & 0 deletions modules/schemes.nf
@@ -0,0 +1,86 @@
// Scheme related processes to go with scheme subworkflow
process downloadScheme {
// Download primer scheme from given repo URL to primer-schemes directory
tag { params.schemeRepoURL }

output:
path "primer-schemes", emit: scheme

script:
"""
git clone ${params.schemeRepoURL} primer-schemes
"""
}
process validateScheme {
publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "primer-schemes", mode: "copy"

input:
path scheme

output:
path "primer-schemes/${params.scheme}/${schemeFolder}/*.reference.fasta", emit: reference
path "primer-schemes/${params.scheme}/${schemeFolder}/*.primer.bed", emit: primer_bed // primer.bed for now as the repos have some conflicting scheme.bed
tuple val(schemeVersion), path("primer-schemes"), emit: scheme

script:
/*
    ARTIC requires the following:
- Scheme folder starting with a V (ex. Vfreed/)
- Scheme input also starting with a V (ex. Vfreed)

    So a few checks are done to attempt to meet those requirements.
    Note that 'def' is not used for schemeVersion/schemeFolder as they are needed in the outputs
*/
schemeVersion = params.schemeVersion
schemeFolder = params.schemeVersion
if ( ! schemeVersion.startsWith("V") ) {
schemeVersion = "V" + schemeVersion
schemeFolder = "V" + schemeFolder
}
"""
# Check for directory being called primer-schemes to match everything up
if [[ "$scheme" != "primer-schemes" ]]; then
mv $scheme primer-schemes
fi

# Adjust folder to make sure it starts with a V
# If we can't find the V{schemeVersion} folder, then check for just {schemeVersion} folder and add the V
if [ ! -d primer-schemes/${params.scheme}/${schemeFolder} ]; then
if [ -d primer-schemes/${params.scheme}/${params.schemeVersion} ]; then
mv primer-schemes/${params.scheme}/${params.schemeVersion} primer-schemes/${params.scheme}/${schemeFolder}
else
echo "ERROR: Cannot find input scheme version ${params.schemeVersion} or adjusted ${schemeVersion}"
exit 1
fi
fi

# File checks
if [ ! -f primer-schemes/${params.scheme}/${schemeFolder}/*reference.fasta ]; then
echo "ERROR: Reference Fasta not found in 'primer-schemes/${params.scheme}/${schemeFolder}/*reference.fasta'"
exit 1
    elif [ ! -f primer-schemes/${params.scheme}/${schemeFolder}/*primer.bed ]; then
        echo "ERROR: Primer bed file not found in 'primer-schemes/${params.scheme}/${schemeFolder}/*primer.bed'"
exit 1
fi
"""
}
process generateAmpliconBed {
publishDir "${params.outdir}/amplicon_info", pattern: "*.bed", mode: "copy"
publishDir "${params.outdir}/amplicon_info", pattern: "primer_prefix.txt", mode: "copy"

input:
path primer_bed

output:
path "amplicon.bed", emit: amplicon_bed
path "tiling_region.bed", emit: tiling_bed
env PREFIX, emit: primer_prefix

script:
"""
primers_to_amplicons.py \\
--bed $primer_bed

PREFIX=\$(cat primer_prefix.txt)
"""
}
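
For context, a minimal nextflow.config sketch of the parameters referenced by these processes (and by the schemeValidate subworkflow later in this PR). The parameter names come from the code above; every value shown is a placeholder for illustration, not a default introduced by the PR:

// Hypothetical parameter values -- illustration only
params {
    schemeRepoURL = 'https://github.com/<org>/primer-schemes.git' // repo cloned by downloadScheme
    scheme        = 'nCoV-2019'                                   // sub-folder of primer-schemes/
    schemeVersion = 'freed'                                       // validateScheme prepends a 'V' if missing
    local_scheme  = null                                          // optional local scheme dir; skips downloadScheme
    outdir        = 'results'                                     // base directory for publishDir
}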
17 changes: 14 additions & 3 deletions workflows/articNcovNanopore.nf
@@ -40,6 +40,7 @@ include {collateSamples} from '../modules/upload.nf'

// import subworkflows
include {CLIMBrsync} from './upload.nf'
include { schemeValidate } from './schemeValidate.nf'
include {Genotyping} from './typing.nf'

// workflow component for artic pipeline with nanopolish
@@ -57,6 +58,16 @@ workflow sequenceAnalysisNanopolish {

articDownloadScheme()

// =============================== //
// Scheme and Reference
// =============================== //
schemeValidate()
ch_scheme = schemeValidate.out.scheme // channel: [ val(scheme_version), path(scheme) ]
ch_reference = schemeValidate.out.reference // channel: [ path(reference.fasta) ]
ch_primer_bed = schemeValidate.out.primer_bed // channel: [ path(primer_bed) ]
ch_amplicon = schemeValidate.out.amplicon_bed // channel: [ path(amplicon_bed) ]
ch_primer_prefix = schemeValidate.out.primer_prefix // channel: [ val(primer_prefix) ]

// Tracking barcodes that fail initial read count checks
accountNoReadsInput(ch_badFastqDirs.collect(),
ch_irida)
@@ -129,10 +140,10 @@
.set{ ch_ncov }

runNcovTools(ch_ncov,
articDownloadScheme.out.reffasta,
articDownloadScheme.out.ncov_amplicon,
ch_reference,
ch_amplicon,
articMinIONNanopolish.out[0].collect(),
articDownloadScheme.out.bed,
ch_primer_bed,
ch_irida,
ch_corrected)
ch_versions = ch_versions.mix(runNcovTools.out.versions.first())
41 changes: 41 additions & 0 deletions workflows/schemeValidate.nf
@@ -0,0 +1,41 @@
/*
Workflow to check, validate, and fix input primer schemes
*/
// Modules to include
include {
downloadScheme ;
validateScheme ;
generateAmpliconBed
} from '../modules/schemes.nf'

/*
Initialize channels from params
*/
ch_scheme = params.local_scheme ? file(params.local_scheme, type: 'dir', checkIfExists: true) : []

// Workflow
workflow schemeValidate {
main:
// =============================== //
// Pull scheme from online
// =============================== //
if ( ! ch_scheme ) {
downloadScheme()
ch_scheme = downloadScheme.out.scheme
}

validateScheme(
ch_scheme
)

generateAmpliconBed(
validateScheme.out.primer_bed
)

emit:
scheme = validateScheme.out.scheme
reference = validateScheme.out.reference
primer_bed = validateScheme.out.primer_bed
amplicon_bed = generateAmpliconBed.out.amplicon_bed
primer_prefix = generateAmpliconBed.out.primer_prefix
}