DKFZ-ODCF · NagaComBio · May 27, 2024 · May 28, 2024 · May 31, 2024 · vinjana
diff --git a/resources/analysisTools/snvPipeline/annotate_with_VEP.sh b/resources/analysisTools/snvPipeline/annotate_with_VEP.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+## Annotate the variants with VEP
+
+## To run
+## LOCAL: sh annotate_variant_with_VEP.sh snvs_{pid}_somatic_snvs_conf_8_to_10.vcf snvs_{pid}_somatic_snvs_conf_8_to_10.VEP.vcf
+
+vep_species="homo_sapiens"
+vep_assembly="GRCh37"
+vep_out_format="vcf"
+
+input_vcf=${1}
+output_vcf=${2}
+threads=${VEP_FORKS}
+
+## Annotate the high confidence variants
+## Parse for the functional consequences
+${PERL_BINARY} ${VEP_SW_PATH} \
+  --input_file $input_vcf \
+  --species $vep_species \
+  --assembly $vep_assembly \
+  --output_file STDOUT \
+  --format $vep_out_format \
+  --fork $threads \
+  --fasta ${VEP_FA_INDEX} \
+  --everything \
+  --vcf \
+  --cache \
+  --offline \
+  --force_overwrite \
+  --no_stats \
+  --dir_cache ${VEP_CACHE_BASE} \
+  | ${PYTHON_BINARY} ${TOOL_PARSE_VEP} | ${BGZIP_BINARY} -f > $output_vcf
diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh
@@ -40,6 +40,17 @@ filenameSeqContextTab=${outputFilenamePrefix}_snvs_with_context_conf_${MIN_CONFI
 filenameMAFconfPlot=${outputFilenamePrefix}_MAF_conf_${MIN_CONFIDENCE_SCORE}_to_10.pdf
 filenameSnvDiagnosticsPlot=${outputFilenamePrefix}_allSNVdiagnosticsPlots.pdf
 
+########################################## VEP annotation #######################################
+## Run VEP on the somatic high confidence SNVs
+${TOOL_ANNOTATE_VEP} ${FILENAME_VCF} ${FILENAME_VCF}.tmp
+[[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 8
+
+# Overwrite the original VCF file with the VEP annotated one
+# NOTE: If there is an error here, one has to rerun the Annotation and DeepAnnotation steps
+mv ${FILENAME_VCF}.tmp ${FILENAME_VCF} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF}
+[[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 9
+
+########################################## Filter ###############################################
 ${PERL_BINARY} ${TOOL_SNV_EXTRACTOR} --infile=${FILENAME_VCF} --minconf=${MIN_CONFIDENCE_SCORE} --pid=${outputFilenamePrefix} --bgzip=${BGZIP_BINARY} --tabix=${TABIX_BINARY} ${SNV_FILTER_OPTIONS}
 [[ "$?" != 0 ]] && echo "There was a non-zero exit code in the somatic file and dbSNP counting pipe" && exit 1
 

diff --git a/resources/analysisTools/snvPipeline/parse_VEP_annotations.py b/resources/analysisTools/snvPipeline/parse_VEP_annotations.py
@@ -0,0 +1,147 @@
+"""
+parse_VEP_annotations.py
+
+This script parses VEP annotations from the input and writes the parsed output to STDOUT.
+It contains functions to parse VEP format, gene consequences and HGVSc annotations, and format transcript information.
+
+Usage:
+    cat VEP_annotated.vcf | python parse_VEP_annotations.py > VEP_annotated_parsed.vcf
+
+"""
+
+import sys
+
+def parse_vep_annotations():
+    """
+    Parses VEP annotations from the input and writes the parsed output to STDOUT.
+
+    This function reads input from `sys.stdin` line by line and processes each line.
+    If a line starts with "#" and contains "##INFO=<ID=CSQ", it extracts the VEP format.
+    If a line starts with "#CHROM", it appends the header for the VEP gene consequence and HGVSc columns.
+    For all other lines, it splits the line by tab and extracts the info field.
+    The info field is then split by semicolon and parsed to extract gene consequence and HGVSc information.
+    Finally, the transcript information is formatted and written to STDOUT.
+
+    Args:
+        None
+
+    Returns:
+        None
+    """
+    vep_format = {}
+
+    for line in sys.stdin:
+        line = line.strip()
+        gene_consequence_hgvsc_ds = {}
+
+        if line.startswith("#"):
+            if line.startswith("##INFO=<ID=CSQ"):
+                vep_format = parse_vep_format(line)
+            if line.startswith("#CHROM"):
+                line = line + "\tVEP_TRANSCRIPTS"
+            # write to STDOUT
+            sys.stdout.write("{0}\n".format(line))
+        else:
+            variant_infos = line.split("\t")
+            info = variant_infos[7]
+            info = info.split(";")
+            gene_consequence_hgvsc_ds = parse_gene_consequence_hgvsc(info, vep_format, gene_consequence_hgvsc_ds)
+            line = format_transcript_info(line, gene_consequence_hgvsc_ds)
+
+            # write to STDOUT
+            sys.stdout.write(line)
+
+def parse_vep_format(line):
+    """
+    Parses the VEP format line and returns a dictionary mapping each field to its index.
+
+    Args:
+        line (str): The VEP format line.
+
+    Returns:
+        dict: A dictionary mapping each field to its index.
+    """
+    vep_format = {}
+    vep_format_line = line.split("Format: ")[1]
+    vep_format_line = vep_format_line.split("|")
+    for i, field in enumerate(vep_format_line):
+        vep_format[field] = i
+    return vep_format
+
+def parse_gene_consequence_hgvsc(info, vep_format, gene_consequence_hgvsc_ds):
+    """
+    Parses gene consequences and HGVSc annotations from VEP annotations.
+
+    Args:
+        info (list): List of VEP annotations.
+        vep_format (dict): Dictionary mapping VEP annotation fields to their indices.
+        gene_consequence_hgvsc_ds (dict): Dictionary to store gene consequences and HGVSc annotations.
+
+    Returns:
+        dict: Updated gene_consequence_hgvsc_ds dictionary with gene consequences and HGVSc annotations.
+
+    """
+    for i in info:
+        if i.startswith("CSQ="):
+            i = i.replace("CSQ=", "")
+            i = i.split(",")
+            for j in i:
+                j = j.split("|")
+                gene_name = j[vep_format["SYMBOL"]]
+                consequence = j[vep_format["Consequence"]]
+                transcript = j[vep_format["Feature"]]
+                hgvsc = j[vep_format["HGVSc"]].split(":")[1] if j[vep_format["HGVSc"]] != "" else ""
+                hgvsp = j[vep_format["HGVSp"]].split(":")[1] if j[vep_format["HGVSp"]] != "" else ""
+                exon = j[vep_format["EXON"]].split("/")[0] if j[vep_format["EXON"]] != "" else ""
+                intron = j[vep_format["INTRON"]].split("/")[0] if j[vep_format["INTRON"]] != "" else ""
+
+                if exon != "" and intron == "":
+                    gene_part = "exon" + exon
+                elif intron != "" and exon == "":
+                    gene_part = "intron" + intron
+                else:
+                    gene_part = "exon" + exon + "-intron" + intron
+
+                biotype = j[vep_format["BIOTYPE"]]
+                impact = j[vep_format["IMPACT"]]
+
+                # Format transcript information with exon number, HGVSc, and HGVSp
+                tran_exon_hgvsc_p = "{0}:{1}:{2}:{3}".format(transcript, gene_part, hgvsc, hgvsp)
+
+                if biotype == "protein_coding" and impact in ["HIGH", "MODERATE"]:
+                    if gene_name in gene_consequence_hgvsc_ds:
+                        if consequence in gene_consequence_hgvsc_ds[gene_name]:
+                            gene_consequence_hgvsc_ds[gene_name][consequence].append(tran_exon_hgvsc_p)
+                        else:
+                            gene_consequence_hgvsc_ds[gene_name][consequence] = [tran_exon_hgvsc_p]
+                    else:
+                        gene_consequence_hgvsc_ds[gene_name] = {consequence: [tran_exon_hgvsc_p]}
+    return gene_consequence_hgvsc_ds
+
+def format_transcript_info(line, gene_consequence_hgvsc_ds):
+    """
+    Formats the transcript information based on the gene, consequence, and HGVSc data.
+
+    Args:
+        line (str): The input line to be formatted.
+        gene_consequence_hgvsc_ds (dict): A dictionary containing gene, consequence, and HGVSc data.
+
+    Returns:
+        str: The formatted line with transcript information.
+
+    """
+    if len(gene_consequence_hgvsc_ds) > 0:
+        transcript_info = ""
+        for gene in gene_consequence_hgvsc_ds:
+            for consequence in gene_consequence_hgvsc_ds[gene]:
+                transcript_info += "{0}|{1}({2});".format(gene, consequence, ','.join(gene_consequence_hgvsc_ds[gene][consequence]))
+        line = line + "\t" + transcript_info + "\n"
+    else:
+        line = line + "\t.\n"
+    return line
+
+
+if __name__ == "__main__":
+
+    # Parse VEP annotations and write to STDOUT
+    parse_vep_annotations()
diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
@@ -67,6 +67,11 @@
         <cvalue name="maxNumOppositeReadsSequenceStrongBias" value="1" type="integer"/>
         <cvalue name="rVcf" value="0.1" type="float" description="max fraction of reads (reads/total reads) we tolerate in the direction opposite to the bias for it to be called an artefact"/>
 
+        <cvalue name='VEP_VERSION' value='110.1' type='string' />
+        <cvalue name='VEP_SW_PATH' value='/software/vep/${VEP_VERSION}/vep' type='path' />
+        <cvalue name='VEP_FORKS' value='3' type='integer'/>
+        <cvalue name='VEP_FA_INDEX' value='${assembliesHG191000GenomesDirectory}/tools_data/germlineSmVs/VEP/reference/hs37d5_PhiX.fa' type ='path' />
+        <cvalue name='VEP_CACHE_BASE' value='/omics/odcf/reference_data/by-tool/vep' type='path' />
 
         <configurationValueBundle name='PIPE_CONFIG:SNV_RELIABILITY'>
             <cvalue name='MAPABILITY' value='${hg19DatabaseUCSCDirectory}/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz' type="path"/>
@@ -153,7 +158,7 @@
             <resourcesets>
                 <rset size="t" memory="1" cores="1" nodes="1" walltime="1" queue="devel"/>
                 <rset size="s" memory="3" cores="2" nodes="1" walltime="4"/>
-                <rset size="m" memory="3" cores="1" nodes="1" walltime="12"/>
+                <rset size="m" memory="3" cores="2" nodes="1" walltime="12"/>
                 <rset size="l" memory="3" cores="2" nodes="1" walltime="120"/>
                 <rset size="xl" memory="4" cores="2" nodes="1" walltime="180"/>
             </resourcesets>
@@ -182,10 +187,10 @@
         <tool name="snvFilter" value="filter_vcf.sh" basepath="snvPipeline">
             <resourcesets>
                 <rset size="t" memory="1" cores="1" nodes="1" walltime="1" queue="devel"/>
-                <rset size="s" memory="1" cores="1" nodes="1" walltime="1"/>
-                <rset size="m" memory="1" cores="1" nodes="1" walltime="2"/>
-                <rset size="l" memory="1" cores="1" nodes="1" walltime="3"/>
-                <rset size="xl" memory="4" cores="1" nodes="1" walltime="4"/>
+                <rset size="s" memory="3" cores="3" nodes="1" walltime="7"/>
+                <rset size="m" memory="3" cores="3" nodes="1" walltime="9"/>
+                <rset size="l" memory="3" cores="3" nodes="1" walltime="11"/>
+                <rset size="xl" memory="4" cores="3" nodes="1" walltime="13"/>
             </resourcesets>
             <input type="file" typeof="de.dkfz.b080.co.files.SNVAnnotationFile" scriptparameter="FILENAME_VCF"/>
             <input type="file" typeof="de.dkfz.b080.co.files.SNVAnnotationFile" scriptparameter="FILENAME_RAW_VCF"/>
@@ -227,6 +232,8 @@
         <tool name="seqContextAnnotator" value="seqContext_annotator.pl" basepath="tools"/>
         <tool name="processAnnovar" value="processAnnovarOutput.pl" basepath="tools"/>
         <tool name="newColsToVcf" value="newCols2vcf.pl" basepath="tools"/>
+        <tool name="annotateVep" value="annotate_with_VEP.sh" basepath="snvPipeline" />
+        <tool name="parseVep" value="parse_VEP_annotations.py" basepath="snvPipeline" />
 
         <tool name="analyzeBamHeader" value="analyzeBamHeader.sh" basepath="tools"/>
     </processingTools>