From ebc6f31528366fddf8238a003e5a44625360ccc9 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 27 May 2024 22:35:22 +0200 Subject: [PATCH 1/3] Add VEP annotation to all variants --- .../snvPipeline/annotate_with_VEP.sh | 33 +++++ .../analysisTools/snvPipeline/filter_vcf.sh | 8 ++ .../snvPipeline/parse_VEP_annotations.py | 131 ++++++++++++++++++ .../configurationFiles/analysisSNVCalling.xml | 17 ++- 4 files changed, 184 insertions(+), 5 deletions(-) create mode 100755 resources/analysisTools/snvPipeline/annotate_with_VEP.sh create mode 100755 resources/analysisTools/snvPipeline/parse_VEP_annotations.py diff --git a/resources/analysisTools/snvPipeline/annotate_with_VEP.sh b/resources/analysisTools/snvPipeline/annotate_with_VEP.sh new file mode 100755 index 0000000..1539ede --- /dev/null +++ b/resources/analysisTools/snvPipeline/annotate_with_VEP.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +## Annotate the variants with VEP + +## To run +## LOCAL: sh annotate_variant_with_VEP.sh snvs_{pid}_somatic_snvs_conf_8_to_10.vcf snvs_{pid}_somatic_snvs_conf_8_to_10.VEP.vcf + +vep_species="homo_sapiens" +vep_assembly="GRCh37" +vep_out_format="vcf" + +input_vcf=${1} +output_vcf=${2} +threads=${VEP_FORKS} + +## Annotate the high confidence variants +## Parse for the functional consequences +${PERL_BINARY} ${VEP_SW_PATH} \ + --input_file $input_vcf \ + --species $vep_species \ + --assembly $vep_assembly \ + --output_file STDOUT \ + --format $vep_out_format \ + --fork $threads \ + --fasta ${VEP_FA_INDEX} \ + --everything \ + --vcf \ + --cache \ + --offline \ + --force_overwrite \ + --no_stats \ + --dir_cache ${VEP_CACHE_BASE} \ + | ${PYTHON_BINARY} ${TOOL_PARSE_VEP} | ${BGZIP_BINARY} -f > $output_vcf \ No newline at end of file diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh index c146034..e6c01a0 100755 --- a/resources/analysisTools/snvPipeline/filter_vcf.sh +++ b/resources/analysisTools/snvPipeline/filter_vcf.sh @@ -40,6 +40,14 @@ filenameSeqContextTab=${outputFilenamePrefix}_snvs_with_context_conf_${MIN_CONFI filenameMAFconfPlot=${outputFilenamePrefix}_MAF_conf_${MIN_CONFIDENCE_SCORE}_to_10.pdf filenameSnvDiagnosticsPlot=${outputFilenamePrefix}_allSNVdiagnosticsPlots.pdf +## Run VEP on the somatic high confidence SNVs +${TOOL_ANNOTATE_VEP} ${FILENAME_VCF} ${FILENAME_VCF}.tmp +[[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 8 + +# Overwrite the original VCF file with the VEP annotated one +mv ${FILENAME_VCF}.tmp ${FILENAME_VCF} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF} +[[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 9 + ${PERL_BINARY} ${TOOL_SNV_EXTRACTOR} --infile=${FILENAME_VCF} --minconf=${MIN_CONFIDENCE_SCORE} --pid=${outputFilenamePrefix} --bgzip=${BGZIP_BINARY} --tabix=${TABIX_BINARY} ${SNV_FILTER_OPTIONS} [[ "$?" != 0 ]] && echo "There was a non-zero exit code in the somatic file and dbSNP counting pipe" && exit 1 diff --git a/resources/analysisTools/snvPipeline/parse_VEP_annotations.py b/resources/analysisTools/snvPipeline/parse_VEP_annotations.py new file mode 100755 index 0000000..44db802 --- /dev/null +++ b/resources/analysisTools/snvPipeline/parse_VEP_annotations.py @@ -0,0 +1,131 @@ +""" +parse_VEP_annotations.py + +This script parses VEP annotations from the input and writes the parsed output to STDOUT. +It contains functions to parse VEP format, gene consequences and HGVSc annotations, and format transcript information. + +Usage: + cat VEP_annotated.vcf | python parse_VEP_annotations.py > VEP_annotated_parsed.vcf + +""" + +import sys + +def parse_vep_annotations(): + """ + Parses VEP annotations from the input and writes the parsed output to STDOUT. + + This function reads input from `sys.stdin` line by line and processes each line. + If a line starts with "#" and contains "##INFO= 0: + transcript_info = "" + for gene in gene_consequence_hgvsc_ds: + for consequence in gene_consequence_hgvsc_ds[gene]: + transcript_info += "{0}|{1}({2});".format(gene, consequence, ','.join(gene_consequence_hgvsc_ds[gene][consequence])) + line = line + "\t" + transcript_info + "\n" + else: + line = line + "\t.\n" + return line + + +if __name__ == "__main__": + + # Parse VEP annotations and write to STDOUT + parse_vep_annotations() diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 8c0f55e..36cd85d 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -67,6 +67,11 @@ + + + + + @@ -153,7 +158,7 @@ - + @@ -182,10 +187,10 @@ - - - - + + + + @@ -227,6 +232,8 @@ + + From e46b90eb4e802a5f55a8fcf66289bcd9a1882bf4 Mon Sep 17 00:00:00 2001 From: paramasi Date: Tue, 28 May 2024 13:46:51 +0200 Subject: [PATCH 2/3] Remove the empty line after header --- resources/analysisTools/snvPipeline/filter_vcf.sh | 1 + resources/analysisTools/snvPipeline/parse_VEP_annotations.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh index e6c01a0..5f0031b 100755 --- a/resources/analysisTools/snvPipeline/filter_vcf.sh +++ b/resources/analysisTools/snvPipeline/filter_vcf.sh @@ -45,6 +45,7 @@ ${TOOL_ANNOTATE_VEP} ${FILENAME_VCF} ${FILENAME_VCF}.tmp [[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 8 # Overwrite the original VCF file with the VEP annotated one +# NOTE: If there is an error here, one has to rerun the Annotation and DeepAnnotation steps mv ${FILENAME_VCF}.tmp ${FILENAME_VCF} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF} [[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 9 diff --git a/resources/analysisTools/snvPipeline/parse_VEP_annotations.py b/resources/analysisTools/snvPipeline/parse_VEP_annotations.py index 44db802..ae1138e 100755 --- a/resources/analysisTools/snvPipeline/parse_VEP_annotations.py +++ b/resources/analysisTools/snvPipeline/parse_VEP_annotations.py @@ -38,7 +38,7 @@ def parse_vep_annotations(): if line.startswith("##INFO= Date: Fri, 31 May 2024 11:21:36 +0200 Subject: [PATCH 3/3] Update VEP colname, add hgvsp & exon/intron numbers --- .../analysisTools/snvPipeline/filter_vcf.sh | 2 ++ .../snvPipeline/parse_VEP_annotations.py | 26 +++++++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh index 5f0031b..04a9fe2 100755 --- a/resources/analysisTools/snvPipeline/filter_vcf.sh +++ b/resources/analysisTools/snvPipeline/filter_vcf.sh @@ -40,6 +40,7 @@ filenameSeqContextTab=${outputFilenamePrefix}_snvs_with_context_conf_${MIN_CONFI filenameMAFconfPlot=${outputFilenamePrefix}_MAF_conf_${MIN_CONFIDENCE_SCORE}_to_10.pdf filenameSnvDiagnosticsPlot=${outputFilenamePrefix}_allSNVdiagnosticsPlots.pdf +########################################## VEP annotation ####################################### ## Run VEP on the somatic high confidence SNVs ${TOOL_ANNOTATE_VEP} ${FILENAME_VCF} ${FILENAME_VCF}.tmp [[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 8 @@ -49,6 +50,7 @@ ${TOOL_ANNOTATE_VEP} ${FILENAME_VCF} ${FILENAME_VCF}.tmp mv ${FILENAME_VCF}.tmp ${FILENAME_VCF} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF} [[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 9 +########################################## Filter ############################################### ${PERL_BINARY} ${TOOL_SNV_EXTRACTOR} --infile=${FILENAME_VCF} --minconf=${MIN_CONFIDENCE_SCORE} --pid=${outputFilenamePrefix} --bgzip=${BGZIP_BINARY} --tabix=${TABIX_BINARY} ${SNV_FILTER_OPTIONS} [[ "$?" != 0 ]] && echo "There was a non-zero exit code in the somatic file and dbSNP counting pipe" && exit 1 diff --git a/resources/analysisTools/snvPipeline/parse_VEP_annotations.py b/resources/analysisTools/snvPipeline/parse_VEP_annotations.py index ae1138e..642347b 100755 --- a/resources/analysisTools/snvPipeline/parse_VEP_annotations.py +++ b/resources/analysisTools/snvPipeline/parse_VEP_annotations.py @@ -38,7 +38,7 @@ def parse_vep_annotations(): if line.startswith("##INFO=