From ba8b0f5e5ed316f52205ac03cfb12a4a0541f1a0 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 19 Aug 2019 09:38:47 +0200 Subject: [PATCH 01/49] Removing ExAC and EVS from annotation and no-control filtering --- .../snvPipeline/confidenceAnnotation_SNVs.py | 15 +-------------- .../analysisTools/snvPipeline/snvAnnotation.sh | 4 +--- .../configurationFiles/analysisSNVCalling.xml | 8 -------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index f13d346..df0e0fa 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -105,8 +105,6 @@ def main(args): variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", } if args.no_control: - variable_headers["ExAC_COL"] = "^ExAC$" - variable_headers["EVS_COL"] = "^EVS$" variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" variable_headers["LOCALCONTROL_COL"] = "^LocalControlAF$" @@ -187,8 +185,6 @@ def main(args): is_weird = False # coindicende with known artefact-producing regions if args.no_control: classification = "somatic" # start with default somatic - inExAC = False - inEVS = False inGnomAD_WES = False inGnomAD_WGS = False inLocalControl = False @@ -234,15 +230,6 @@ def main(args): if args.no_control: if indbSNP and is_commonSNP and not is_clinic: reasons += "dbSNP(NoControl)" - if help["ExAC_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["ExAC_COL"], "AF").split(','))): - inExAC = True - infofield["ExAC"] = "ExAC" - reasons += "ExAC(NoControl)" - if help["EVS_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["EVS_COL"], "MAF").split(','))): - inEVS = True - infofield["EVS"] = "EVS" - reasons += "EVS(NoControl)" - if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True infofield["gnomAD_Exomes"] = "gnomAD_Exomes" @@ -397,7 +384,7 @@ def main(args): if (args.no_control): # an SNV that is in dbSNP but not "clinic" or/and in 1 KG with high frequency is probably germline - if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inExAC or inEVS or inGnomAD_WES or inGnomAD_WGS or inLocalControl): + if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl): classification = "SNP_support_germline" # 3) information from the calls and germline comparisons: coverage, strand bias, variant support, .. diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh index 7d62a8e..d3c97f5 100755 --- a/resources/analysisTools/snvPipeline/snvAnnotation.sh +++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh @@ -90,9 +90,7 @@ if [[ ${isNoControlWorkflow-false} == "false" ]]; then fi cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${DBSNP} --columnName=${DBSNP_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${ExAC} --columnName ${ExAC_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${EVS} --columnName ${EVS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ + ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL} --columnName ${LOCALCONTROL_COL} --bFileType vcf --reportMatchType --reportLevel 4 " diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 30a9dc3..e55d168 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -47,10 +47,6 @@ - - - - @@ -76,8 +72,6 @@ - - @@ -87,8 +81,6 @@ - - From 8584941dc1a61cc7c45f87ca847b97a8e9b4fc13 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 11 Oct 2019 12:05:56 +0200 Subject: [PATCH 02/49] Removing DUKE, DAC and HiSeqDepth based confidence annotaions --- .../snvPipeline/confidenceAnnotation_SNVs.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index df0e0fa..0e2aa0b 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -54,7 +54,7 @@ def main(args): '##FORMAT=\n' \ '##FORMAT=\n' \ '##FILTER=\n' \ - '##FILTER=\n' \ + '##FILTER=\n' \ '##FILTER=\n' \ '##FILTER=\n' \ '##FILTER=\n' \ @@ -98,8 +98,8 @@ def main(args): if line[0] == "#": headers = list(line[1:].rstrip().split('\t')) - fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED", - "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$", + fixed_headers = ["^INFO$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", + "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$", "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$", "^seqBiasPresent_2$", "^seqingBiasPresent_2$"] variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", } @@ -321,19 +321,19 @@ def main(args): filterfield["RE"] = 1 # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat - if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]: - confidence -= 3 # really bad region, usually centromeric repeats - is_weird = True - reasons += "Blacklist(-3)" - filterfield["BL"] = 1 + #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]: + # confidence -= 3 # really bad region, usually centromeric repeats + # is_weird = True + # reasons += "Blacklist(-3)" + # filterfield["BL"] = 1 # HiSeqDepth: regions "attracting" reads; often coincide with tandem repeats and CEN/TEL, # not always with low mapability - if help["HISEQDEPTH_VALID"]: - confidence -= 3 # really really bad region! - is_weird = True - reasons += "Hiseqdepth(-3)" - filterfield["HSDEPTH"] = 1 + #if help["HISEQDEPTH_VALID"]: + # confidence -= 3 # really really bad region! + # is_weird = True + # reasons += "Hiseqdepth(-3)" + # filterfield["HSDEPTH"] = 1 # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... # Everything with really high number of occurences is artefacts From 7e49b5d9fa45165d440f84cb222097e43515867b Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 11 Oct 2019 12:07:50 +0200 Subject: [PATCH 03/49] Removing getRefGenomeAndChrPrefixFromHeader functions --- resources/analysisTools/snvPipeline/filter_vcf.sh | 4 ++-- resources/analysisTools/snvPipeline/snvAnnotation.sh | 6 +++--- resources/analysisTools/snvPipeline/snvCalling.sh | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh index dc71e1e..1d57f0e 100755 --- a/resources/analysisTools/snvPipeline/filter_vcf.sh +++ b/resources/analysisTools/snvPipeline/filter_vcf.sh @@ -19,8 +19,8 @@ mpileupDirectory=`dirname ${FILENAME_VCF}` RUN_PLOTS=${RUN_PLOTS-1} RUN_PUREST=${RUN_PUREST-1} -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}` # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail... diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh index d3c97f5..5a121bd 100755 --- a/resources/analysisTools/snvPipeline/snvAnnotation.sh +++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh @@ -36,8 +36,8 @@ scriptDirectory=`dirname ${WRAPPED_SCRIPT}` #TOOLS_DIR=${BASEPATH_TOOLS} FILEBASE=`dirname ${FILENAME_VCF_IN}`/`basename ${FILENAME_VCF_IN} .vcf` -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME [[ -f ${FILENAME_CHECKPOINT} ]] && rm ${FILENAME_CHECKPOINT} @@ -90,7 +90,7 @@ if [[ ${isNoControlWorkflow-false} == "false" ]]; then fi cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${DBSNP} --columnName=${DBSNP_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \ + ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL} --columnName ${LOCALCONTROL_COL} --bFileType vcf --reportMatchType --reportLevel 4 " diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh index de88566..92f1113 100755 --- a/resources/analysisTools/snvPipeline/snvCalling.sh +++ b/resources/analysisTools/snvPipeline/snvCalling.sh @@ -24,8 +24,8 @@ fi PARM_CHR_INDEX=${PARM_CHR_INDEX-} # Ignore chr prefix and set it manually -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME chr="" if [[ -n "$PARM_CHR_INDEX" ]] From ce97964057460c23f13dee706472a95cbdb15799 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 11 Oct 2019 12:08:56 +0200 Subject: [PATCH 04/49] Updating annotation file paths to hg38 directory --- .../configurationFiles/analysisSNVCalling.xml | 81 ++++++++++--------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index e55d168..7b1f92e 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -41,24 +41,29 @@ + + + + + + - + - + - + - - - - + + - - - - + + + + + @@ -123,13 +128,13 @@ - - - - - - - + + + + + + + @@ -145,17 +150,17 @@ # see also the usage report of ngs2/trunk/tools/annotate_vcf.pl # if any whitespaces are required, use quotation marks like in this example: #MAPABILITY="/ibios/co02/annotation/hg19/wgEncodeCrgMapabilityAlign100mer.bedGraph.gz:::-breakPointMode -aEndOffset=1"--> - - - - - - - - - - - + + + + + + + + + + + @@ -196,8 +201,8 @@ - - + + @@ -223,9 +228,9 @@ - - - + + + @@ -239,10 +244,10 @@ - - - - + + + + From c5142972e37c1e916d865108b9726306d749aee3 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Tue, 21 Jan 2020 14:39:32 +0100 Subject: [PATCH 05/49] Removing the hg38 encode blacklist filter --- .../snvPipeline/confidenceAnnotation_SNVs.py | 7 +++++++ resources/configurationFiles/analysisSNVCalling.xml | 11 ++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index 0e2aa0b..6c8d22c 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -320,6 +320,13 @@ def main(args): reasons += "Segmental_dup(-2)" filterfield["RE"] = 1 + # Encode blacklist, generated for hg38 mappings +# if help["ENCODE_BLACKLIST_VALID"]: +# confidence -= 3 # High signal region and low mappability +# is_weird = True +# reasons += "Blacklist(-3)" +# filterfield["BL"] = 1 + # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]: # confidence -= 3 # really bad region, usually centromeric repeats diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 7b1f92e..bff9649 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -42,10 +42,10 @@ - - - - + + + + @@ -56,7 +56,7 @@ - + @@ -132,6 +132,7 @@ + From 3bebd1f64f59b0a75b298255d81d71c47227e11f Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 7 Jan 2021 11:38:26 +0100 Subject: [PATCH 06/49] Variant calling from CRAM files --- resources/analysisTools/snvPipeline/filter_PEoverlap.py | 6 +++--- resources/analysisTools/snvPipeline/snvAnnotation.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py index b8c06c1..6a4bd18 100755 --- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py +++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py @@ -123,15 +123,15 @@ def performAnalysis(args): #vcfInFile = open(args.inf, "r") #outFile = open(args.outf, "w") - # Reference file for BAQ_recalcuation and local realignment - reference_file = pysam.Fastafile(args.refFileName) + # Reference file for CRAM files + reference_file = args.refFileName mode = "r" if args.alignmentFile[-4:] == ".bam": mode += "b" elif args.alignmentFile[-5:] == ".cram": mode += "c" - samfile = pysam.Samfile(args.alignmentFile, mode) # This should work for BAM file only (with random access). + samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file) # This should work for BAM file only (with random access). if args.altPosF != '': ALT_basePositions_file = args.altPosF diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh index 5a121bd..4400766 100755 --- a/resources/analysisTools/snvPipeline/snvAnnotation.sh +++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh @@ -17,7 +17,7 @@ declare -r outputDirectory=`dirname ${FILENAME_VCF_OUT}` relinked=false # (Re-)link bam file and index. The pipeline cannot handle i.e. .mdup.bam and .mdup.bai # Look up the index with .bam.bai. If the file exists nothing happens. -if [[ ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.bai ]] +if [[ ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.bai && ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.crai ]] then shortBamIdx=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}`"/"`basename ${TUMOR_BAMFILE_FULLPATH_BP} .bam`".bai" [[ ! -f ${shortBamIdx} ]] && echo "Bam index file cannot be found!" && exit -15 From b0ac261f6b9edb811e784bc352071bfa45f91a82 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Feb 2021 13:14:58 +0100 Subject: [PATCH 07/49] Checking for nonREFnonALT in BoolCounter class --- .../snvPipeline/filter_PEoverlap.py | 72 +++++++++++-------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py index 6a4bd18..d6f6a97 100755 --- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py +++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py @@ -90,24 +90,29 @@ def decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, D if remove_is_reverse: if DP4ar > 0: DP4ar -= 1 else: - if DP4af > 0: DP4af -= 1 + if DP4af > 0: DP4af -= 1 return(DP4rf, DP4rr, DP4af, DP4ar) class BoolCounter: """ A class for counter objects """ - def __init__(self): - self.ref = 0 - self.alt = 0 + def __init__(self, ref, alt): + self.REF = ref + self.ALT = alt + self.ref_count = 0 + self.alt_count = 0 - def update(self, remove): - if (remove): - self.ref += 1 - else: - self.alt += 1 + def update(self, current_base): + if self.REF == current_base: + self.ref_count += 1 + elif self.ALT == current_base: + self.alt_count += 1 + + def update_nonREFnonALT(self, remove_count): + self.alt_count =+ remove_count def counted(self): - return [self.ref, self.alt] + return [self.ref_count, self.alt_count] # MAIN ANALYSIS PROCEDURE def performAnalysis(args): @@ -129,9 +134,12 @@ def performAnalysis(args): mode = "r" if args.alignmentFile[-4:] == ".bam": mode += "b" + multiple_iterators = False + samfile = pysam.Samfile(args.alignmentFile, mode) elif args.alignmentFile[-5:] == ".cram": mode += "c" - samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file) # This should work for BAM file only (with random access). + multiple_iterators = False + samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file) if args.altPosF != '': ALT_basePositions_file = args.altPosF @@ -195,12 +203,13 @@ def performAnalysis(args): ACGTNacgtn1 = [0]*10 ACGTNacgtn2 = [0]*10 - count_PE = BoolCounter() # Starting the counter for the forward and reverse reads removed due to PE overlap detection - count_supple = BoolCounter() # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions - count_mismatch = BoolCounter() # " for mismatch report + count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection + count_supple = BoolCounter(REF, ALT) # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions + count_mismatch = BoolCounter(REF, ALT) # " for mismatch report + count_nonREFnonALT = BoolCounter(REF, ALT) # " for nonREF and nonALT bases # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq). - for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False): + for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators): if pileupcolumn.pos == (pos-1): #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments) for pileupread in pileupcolumn.pileups: @@ -244,7 +253,7 @@ def performAnalysis(args): if numberOfMismatches > args.allowedNumberOfMismatches: remove_base = pileupread.alignment.seq[pileupread.query_position] remove_is_reverse = pileupread.alignment.is_reverse - count_mismatch.update(remove_base == REF) + count_mismatch.update(remove_base) (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) # after decreasing the respective DP4 value, go directly to the next read # without remembering the current read @@ -277,7 +286,6 @@ def performAnalysis(args): #if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: # DEBUG July 23 2012: BROAD BAM problem due to pileupread.alignment.qqual being shorter sometimes than pileupread.alignment.qual if(pileupread.alignment.query_name in readNameHash): - #print pileupread.alignment.query_name old_qual = readNameHash[pileupread.alignment.query_name][0] old_base = readNameHash[pileupread.alignment.query_name][1] old_is_reverse = readNameHash[pileupread.alignment.query_name][2] @@ -303,9 +311,10 @@ def performAnalysis(args): remove_base = current_base remove_is_reverse = current_is_reverse remove_old = False - - count_PE.update(remove_base == REF) - (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) + + count_PE.update(remove_base) + (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) + # If current base is better, then removing the information about old mate # If current base is not good, then do nothing if remove_old: @@ -382,8 +391,7 @@ def performAnalysis(args): break # only one pileup for a position # Calculating duplicates based on read-mate pair's start positions (chr id and start location) - count_duplicate = BoolCounter() - + count_duplicate = BoolCounter(REF, ALT) for key in readMateHash: value_length = len(readMateHash[key]) if value_length > 0: @@ -394,20 +402,26 @@ def performAnalysis(args): remove_base = decreaseInfo[0] remove_is_reverse = decreaseInfo[1] - count_duplicate.update(remove_base == REF) + count_duplicate.update(remove_base) (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd + + if (DP4[2] + DP4[3]) > ALTcount: # that the ALTcount is larger happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling + if DP4af >= nonREFnonALTfwd: + DP4af -= nonREFnonALTfwd + count_nonREFnonALT.update_nonREFnonALT(nonREFnonALTfwd) + + if DP4ar >= nonREFnonALTrev: + DP4ar -= nonREFnonALTrev + count_nonREFnonALT.update_nonREFnonALT(nonREFnonALTrev) + #Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)" supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted())) - supple_dup_str += 'DP2dup=' + ','.join(map(str, count_duplicate.counted())) + supple_dup_str += ';DP2dup=' + ','.join(map(str, count_duplicate.counted())) supple_dup_str += ';DP2pairEnd=' + ','.join(map(str, count_PE.counted())) supple_dup_str += ';DP2mis=' + ','.join(map(str, count_mismatch.counted())) - supple_dup_str += ';DPnonREFnonALT=' + str(nonREFnonALT) - - if (DP4[2] + DP4[3]) > ALTcount: # that the ALTcount is larger happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling - if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd - if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev + supple_dup_str += ';DP2nonREFnonALT=' + ','.join(map(str, count_nonREFnonALT.counted())) ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1]) ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2]) From 674900f419d71119837bbf5cc0355d4463b6d386 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Feb 2021 13:33:46 +0100 Subject: [PATCH 08/49] updating pysam to 0.16.0.1 --- .../analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh index 69338b1..e1aeb9d 100755 --- a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh +++ b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh @@ -19,7 +19,7 @@ module load bedtools/"${BEDTOOLS_VERSION:?BEDTOOLS_VERSION undefined}" module load pypy/"${PYPY_VERSION:?PYPY_VERSION undefined}" set +ue -source /odcf/cluster/virtualenvs/paramasi/python_2.7.9_SNVCalling_pysam_0.15.2/bin/activate +source /dkfz/cluster/virtualenvs/paramasi/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate set -ue export BGZIP_BINARY=bgzip From 128c1c0bc0ea39f2231f74f5c0567bf518527043 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Feb 2021 13:34:23 +0100 Subject: [PATCH 09/49] Moving files to ngs_share --- .../configurationFiles/analysisSNVCalling.xml | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index bff9649..3afbcc1 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -49,15 +49,18 @@ + + + - + - - + + @@ -128,14 +131,9 @@ - - + - - - - @@ -152,13 +150,9 @@ # if any whitespaces are required, use quotation marks like in this example: #MAPABILITY="/ibios/co02/annotation/hg19/wgEncodeCrgMapabilityAlign100mer.bedGraph.gz:::-breakPointMode -aEndOffset=1"--> - - - - From 2de0727d2e90798cd017bfd5f40fee5935f7b844 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Mar 2021 10:27:11 +0100 Subject: [PATCH 10/49] Updating BoolCounter class --- .../snvPipeline/filter_PEoverlap.py | 46 +++++++------------ 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py index d6f6a97..f0cabc3 100755 --- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py +++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py @@ -96,20 +96,20 @@ def decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, D class BoolCounter: """ A class for counter objects """ - def __init__(self, ref, alt): - self.REF = ref - self.ALT = alt + def __init__(self, ref_base, alt_base): + self.ref_base = ref_base + self.alt_base = alt_base self.ref_count = 0 self.alt_count = 0 def update(self, current_base): - if self.REF == current_base: + if self.ref_base == current_base: self.ref_count += 1 - elif self.ALT == current_base: + elif self.alt_base == current_base: self.alt_count += 1 - def update_nonREFnonALT(self, remove_count): - self.alt_count =+ remove_count + def update_nonREFnonALT(self, count): + self.alt_count =+ count def counted(self): return [self.ref_count, self.alt_count] @@ -132,13 +132,12 @@ def performAnalysis(args): reference_file = args.refFileName mode = "r" - if args.alignmentFile[-4:] == ".bam": - mode += "b" - multiple_iterators = False + multiple_iterators = False + if args.alignmentFile.split(".")[-1] == "bam": + mode += "b" samfile = pysam.Samfile(args.alignmentFile, mode) - elif args.alignmentFile[-5:] == ".cram": - mode += "c" - multiple_iterators = False + elif args.alignmentFile.split(".")[-1] == "cram": + mode += "c" samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file) if args.altPosF != '': @@ -274,14 +273,6 @@ def performAnalysis(args): ACGTNacgtn1[ACGTNacgtn_index[1]] += 1 else: ACGTNacgtn2[ACGTNacgtn_index[1]] += 1 - - # Supplementary reads are removed with 'flag_filter=3844' argument in samfile.pileup() above, so this block is commented out. - #if(pileupread.alignment.is_supplementary): - # remove_base = pileupread.alignment.seq[pileupread.query_position] - # remove_is_reverse = pileupread.alignment.is_reverse - # count_supple.update(remove_base == REF) - # (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - # continue #if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: # DEBUG July 23 2012: BROAD BAM problem due to pileupread.alignment.qqual being shorter sometimes than pileupread.alignment.qual @@ -405,16 +396,13 @@ def performAnalysis(args): count_duplicate.update(remove_base) (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd - if (DP4[2] + DP4[3]) > ALTcount: # that the ALTcount is larger happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling - if DP4af >= nonREFnonALTfwd: - DP4af -= nonREFnonALTfwd - count_nonREFnonALT.update_nonREFnonALT(nonREFnonALTfwd) + if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd - if DP4ar >= nonREFnonALTrev: - DP4ar -= nonREFnonALTrev - count_nonREFnonALT.update_nonREFnonALT(nonREFnonALTrev) + if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev + + nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd + count_nonREFnonALT.update_nonREFnonALT(nonREFnonALT) #Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)" supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted())) From d8f3cf166c553992b37d3b63458ad381bbe876b8 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Mar 2021 10:31:42 +0100 Subject: [PATCH 11/49] REF name via BAM header --- resources/analysisTools/snvPipeline/filter_vcf.sh | 4 ++-- resources/analysisTools/snvPipeline/snvCalling.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh index 1d57f0e..dc71e1e 100755 --- a/resources/analysisTools/snvPipeline/filter_vcf.sh +++ b/resources/analysisTools/snvPipeline/filter_vcf.sh @@ -19,8 +19,8 @@ mpileupDirectory=`dirname ${FILENAME_VCF}` RUN_PLOTS=${RUN_PLOTS-1} RUN_PUREST=${RUN_PUREST-1} -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}` # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail... diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh index 92f1113..de88566 100755 --- a/resources/analysisTools/snvPipeline/snvCalling.sh +++ b/resources/analysisTools/snvPipeline/snvCalling.sh @@ -24,8 +24,8 @@ fi PARM_CHR_INDEX=${PARM_CHR_INDEX-} # Ignore chr prefix and set it manually -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME chr="" if [[ -n "$PARM_CHR_INDEX" ]] From b99be8f5a627b4e3c447262db17f2d6e7171997e Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Mar 2021 10:34:28 +0100 Subject: [PATCH 12/49] Reverting generic xml to hg19 --- .../configurationFiles/analysisSNVCalling.xml | 70 ++++++++++--------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 3afbcc1..6049fab 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -39,34 +39,32 @@ - + - - - - - - - - - - - + + + + + + + - + - + - - - + + + + + - - - - + + + + @@ -93,7 +91,7 @@ - + - - - + + + + + + + @@ -149,13 +151,17 @@ # see also the usage report of ngs2/trunk/tools/annotate_vcf.pl # if any whitespaces are required, use quotation marks like in this example: #MAPABILITY="/ibios/co02/annotation/hg19/wgEncodeCrgMapabilityAlign100mer.bedGraph.gz:::-breakPointMode -aEndOffset=1"--> - - - - - - - + + + + + + + + + + + From 98bb76e568da9108ddf6759786ab3f532e19dfa1 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Mar 2021 10:35:23 +0100 Subject: [PATCH 13/49] New xml for GRCh38 files --- .../analysisSNVCallingGRCh38.xml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 resources/configurationFiles/analysisSNVCallingGRCh38.xml diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml new file mode 100644 index 0000000..77711f8 --- /dev/null +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 90b99785600907f1e809bc6661d487cf8b3bf66f Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Mar 2021 10:36:18 +0100 Subject: [PATCH 14/49] chrLength file with 'chr' prefix --- .../snvPipeline/intermutationDistance_Coord_color.r | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r index e00938f..67ec8b3 100755 --- a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r +++ b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r @@ -45,6 +45,11 @@ c.array <- unlist(strsplit(opt$chrArray, ',', fixed = TRUE)) data <- read.table(opt$chrLengthFile, header=FALSE) chrLength <- data.frame(data) rownames(chrLength) <- chrLength$V1 + +if((opt$chrPrefix != "" && grepl(opt$chrPrefix, chrLength$V1[1])) && !grepl(opt$chrPrefix, c.array[1])){ + c.array <- paste0(opt$chrPrefix, c.array, opt$chrSuffix) +} + xtotal <- sum(chrLength[c.array,2]/10) xoffsets <- c(0) @@ -84,7 +89,7 @@ pdf(opt$outFilePrefix, width=20, height=8) plotColors = c("CA"="blue","CG"="black","CT"="red","TA"="purple","TC"="orange","TG"="green") baseChanges <- c("CA", "CG", "CT", "TA", "TC","TG") -c.name <- paste0(opt$chrPrefix, c.array[1], opt$chrSuffix) +c.name <- c.array[1] sel <- which(dat$diff > 0) if(length(sel)){ @@ -101,7 +106,7 @@ if(length(sel)){ # chromosomes <- c(2:22, 'X') for (chr in c.array) { - c.name <- paste0(opt$chrPrefix, chr, opt$chrSuffix) + c.name <- chr sel <- which(dat$X.CHROM == c.name & dat$diff > 0) points((dat$POS[sel]/10+xoffset)/xtotal, log10(dat$diff[sel]), col = plotColors[dat$change[sel]], pch=16,cex=0.8) From 58b77cd1dd925d1ad8fe65c4e74de9c0d0300f98 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Tue, 16 Mar 2021 09:55:28 +0100 Subject: [PATCH 15/49] Removing hard-coded header parsing --- .../snvPipeline/in_dbSNPcounter.pl | 53 +++++-------------- .../snvPipeline/snv_extractor_v1.pl | 15 +++--- 2 files changed, 21 insertions(+), 47 deletions(-) diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl index cbf0ef3..737ae04 100755 --- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl +++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl @@ -43,49 +43,24 @@ } chomp $header; -my $DBSBP = 13; -my $KG = 14; -my $CONFIDENCE = 28; -my $CANNOTATION = -1; -my $RECLASSIFICATION = -1; -my $CLASSIFICATION; +my @head=split("\t", $header); +my %col; -@help = split(/\t/, $header); -for (my $c = 0; $c < @help; $c++) +my $i = 0; +while($i < @head) { - # fixed column labels (from vcf_pileup_compare) - if ($help[$c] eq "CONFIDENCE") - { - $CONFIDENCE = $c; - print STDERR "CONFIDENCE in column $c\n"; - } - if ($help[$c] eq "1K_GENOMES") - { - $KG = $c; - print STDERR "INFO in column $c\n"; - } - if ($help[$c] eq "DBSNP") - { - $DBSBP = $c; - print STDERR "DBSNP_COL in column $c\n"; - } - if ($help[$c] eq "ANNOTATION_control") - { - $CANNOTATION = $c; - print STDERR "ANNOTATION_control in column $c\n"; - } - if ($help[$c] eq "RECLASSIFICATION") - { - $RECLASSIFICATION = $c; - print STDERR "RECLASSIFICATION in column $c\n"; - } + if($head[$i] eq "DBSNP"){$col{"DBSNP"} = $i; print STDERR "DBSNP in column ", $i+1,"\n";} + if($head[$i] eq "1K_GENOMES"){$col{"1K_GENOMES"} = $i; print STDERR "1K_GENOMES in column ", $i+1,"\n";} + if($head[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print STDERR "RECLASSIFICATION in column ", $i+1,"\n";} + if($head[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print STDERR "ANNOTATION_control in column ", $i+1,"\n";} + if($head[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print STDERR "CONFIDENCE in column ", $i+1,"\n";} + $i++; } -if ($CANNOTATION > 0) { - $CLASSIFICATION = $CANNOTATION; -} else { - $CLASSIFICATION = $RECLASSIFICATION; -} +my $DBSBP = $col{"DBSNP"}; +my $KG = $col{"1K_GENOMES"}; +my $CONFIDENCE = $col{"CONFIDENCE"}; +my $CLASSIFICATION = $col{"RECLASSIFICATION"}; # I know that it's evilly hardcoded! while () diff --git a/resources/analysisTools/snvPipeline/snv_extractor_v1.pl b/resources/analysisTools/snvPipeline/snv_extractor_v1.pl index cea35fa..d880b2e 100755 --- a/resources/analysisTools/snvPipeline/snv_extractor_v1.pl +++ b/resources/analysisTools/snvPipeline/snv_extractor_v1.pl @@ -64,18 +64,17 @@ print STDERR "NOTE: The reading of the VCF will end with a broken pipe error, because we only read the header and then stop.\n"; close IN; -my @head=split("\t", $head); +my @header=split("\t", $head); my %col; - my $i = 0; -while($i < @head) +while($i < @header) { - if($head[$i] eq "EXONIC_CLASSIFICATION"){$col{"EXONIC_CLASSIFICATION"} = $i; print "EXONIC_CLASSIFICATION in column ", $i+1,"\n";} - if($head[$i] eq "ANNOVAR_FUNCTION"){$col{"ANNOVAR_FUNCTION"} = $i; print "ANNOVAR_FUNCTION in column ", $i+1,"\n";} - if($head[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print "RECLASSIFICATION in column ", $i+1,"\n";} - if($head[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print "ANNOTATION_control in column ", $i+1,"\n";} - if($head[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print "CONFIDENCE in column ", $i+1,"\n";} + if($header[$i] eq "EXONIC_CLASSIFICATION"){$col{"EXONIC_CLASSIFICATION"} = $i; print "EXONIC_CLASSIFICATION in column ", $i+1,"\n";} + if($header[$i] eq "ANNOVAR_FUNCTION"){$col{"ANNOVAR_FUNCTION"} = $i; print "ANNOVAR_FUNCTION in column ", $i+1,"\n";} + if($header[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print "RECLASSIFICATION in column ", $i+1,"\n";} + if($header[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print "ANNOTATION_control in column ", $i+1,"\n";} + if($header[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print "CONFIDENCE in column ", $i+1,"\n";} $i++; } From 1af4b686b575e8c2230ed09bebadec6355c92e77 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Tue, 16 Mar 2021 10:01:09 +0100 Subject: [PATCH 16/49] Liftover local control for hg38 WES and WGS --- .../snvPipeline/confidenceAnnotation_SNVs.py | 21 ++++++++++++------- .../analysisTools/snvPipeline/filter_vcf.sh | 3 ++- .../snvPipeline/snvAnnotation.sh | 7 ++++--- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index 6c8d22c..88e09f3 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -107,7 +107,8 @@ def main(args): if args.no_control: variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" - variable_headers["LOCALCONTROL_COL"] = "^LocalControlAF$" + variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" + variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" else: fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ] @@ -187,7 +188,8 @@ def main(args): classification = "somatic" # start with default somatic inGnomAD_WES = False inGnomAD_WGS = False - inLocalControl = False + inLocalControl_WES = False + inLocalControl_WGS = False else: # for potential re-classification (e.g. low coverage in control and in dbSNP => probably germline) classification = help["ANNOTATION_control"] # start with original classification @@ -239,10 +241,15 @@ def main(args): infofield["gnomAD_Genomes"] = "gnomAD_Genomes" reasons += "gnomAD_Genomes(NoControl)" - if help["LOCALCONTROL_COL_VALID"] and any(af > 0.02 for af in map(float, extract_info(help["LOCALCONTROL_COL"], "AF").split(','))): - inLocalControl = True - infofield["LocalControl"] = "LocalControl" - reasons += "LocalControl(NoControl)" + if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): + inLocalControl_WGS = True + infofield["LocalControl_WGS"] = "LocalControl_WGS" + reasons += "LocalControl_WGS(NoControl)" + if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): + inLocalControl_WES = True + infofield["LocalControl_WES"] = "LocalControl_WES" + reasons += "LocalControl_WES(NoControl)" + # Punish for biases round 1 if idx_pcrbias != -1 and idx_seqbias != -1 and args.round == 1: @@ -391,7 +398,7 @@ def main(args): if (args.no_control): # an SNV that is in dbSNP but not "clinic" or/and in 1 KG with high frequency is probably germline - if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl): + if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS or inLocalControl_WES): classification = "SNP_support_germline" # 3) information from the calls and germline comparisons: coverage, strand bias, variant support, .. diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh index dc71e1e..0e8f226 100755 --- a/resources/analysisTools/snvPipeline/filter_vcf.sh +++ b/resources/analysisTools/snvPipeline/filter_vcf.sh @@ -40,7 +40,8 @@ if [[ ${isNoControlWorkflow-false} == "true" ]]; then #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AFR_AF ${CRIT_1KGENOMES_maxMAF}+" [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+" [[ ${FILTER_NON_CLINIC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${DBSNP_COL} CLN,COMMON nonexist,exist" - [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" + [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WGS_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" + [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WES_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" [[ ${FILTER_RECURRENCE} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${RECURRENCE_COL} . ${CRIT_RECURRENCE}+" if [[ ${FILTER_VALUES} != "" ]]; then diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh index 4400766..575e50e 100755 --- a/resources/analysisTools/snvPipeline/snvAnnotation.sh +++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh @@ -91,9 +91,10 @@ fi cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${DBSNP} --columnName=${DBSNP_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL} --columnName ${LOCALCONTROL_COL} --bFileType vcf --reportMatchType --reportLevel 4 " + ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ + ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ + ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL_WGS} --columnName ${LOCALCONTROL_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \ + ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL_WES} --columnName ${LOCALCONTROL_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 " if [[ -f ${RECURRENCE} ]]; then cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${RECURRENCE} --columnName ${RECURRENCE_COL} --bFileType vcf" From baabd823301d9ed32f545b2e705ba8f22e86d1f7 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Tue, 16 Mar 2021 10:03:25 +0100 Subject: [PATCH 17/49] hg19 specific annotations --- .../snvPipeline/confidenceAnnotation_SNVs.py | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index 88e09f3..4ffe53d 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -32,7 +32,15 @@ def extract_info(info, keys, sep=";"): rtn = '0' if rtn == "None" else rtn return rtn +def validate_refgenome(refname): + valid_refgenome = ('hs37d5', 'GRCh38') + if refname not in valid_refgenome: + raise ValueError('Reference name (--refgenome) is not valid: %s. Valid reference genome names are %s' % (refname, ', '.join(valid_refgenome))) + def main(args): + + validate_refgenome(args.refgenome[0]) + if args.pancanout is not None and not args.no_makehead: header = '##fileformat=VCFv4.1\n' \ '##fileDate=' + time.strftime("%Y%m%d") + '\n' \ @@ -102,6 +110,13 @@ def main(args): "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$", "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$", "^seqBiasPresent_2$", "^seqingBiasPresent_2$"] + + if args.refgenome[0] == "hs37d5": + fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED", + "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$", + "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$", + "^seqBiasPresent_2$", "^seqingBiasPresent_2$"] + variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", } if args.no_control: @@ -327,27 +342,23 @@ def main(args): reasons += "Segmental_dup(-2)" filterfield["RE"] = 1 - # Encode blacklist, generated for hg38 mappings -# if help["ENCODE_BLACKLIST_VALID"]: -# confidence -= 3 # High signal region and low mappability -# is_weird = True -# reasons += "Blacklist(-3)" -# filterfield["BL"] = 1 - - # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat - #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]: - # confidence -= 3 # really bad region, usually centromeric repeats - # is_weird = True - # reasons += "Blacklist(-3)" - # filterfield["BL"] = 1 - - # HiSeqDepth: regions "attracting" reads; often coincide with tandem repeats and CEN/TEL, - # not always with low mapability - #if help["HISEQDEPTH_VALID"]: - # confidence -= 3 # really really bad region! - # is_weird = True - # reasons += "Hiseqdepth(-3)" - # filterfield["HSDEPTH"] = 1 + # Only for hg19 reference genome + if args.refgenome[0] == "hs37d5": + + # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat + if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]: + confidence -= 3 # really bad region, usually centromeric repeats + is_weird = True + reasons += "Blacklist(-3)" + filterfield["BL"] = 1 + + # HiSeqDepth: regions "attracting" reads; often coincide with tandem repeats and CEN/TEL, + # not always with low mapability + if help["HISEQDEPTH_VALID"]: + confidence -= 3 # really really bad region! + is_weird = True + reasons += "Hiseqdepth(-3)" + filterfield["HSDEPTH"] = 1 # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... # Everything with really high number of occurences is artefacts From 5a764ef4c765aafddeb2f2d28490d50a4ab4a9e1 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Tue, 16 Mar 2021 10:06:35 +0100 Subject: [PATCH 18/49] User defined threshold-based 'SNP_support_germline' annotations --- .../snvPipeline/confidenceAnnotation_SNVs.py | 11 ++++++++--- .../analysisTools/snvPipeline/snvAnnotation.sh | 17 +++++++++++------ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index 4ffe53d..dd2c281 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -225,7 +225,7 @@ def main(args): in1KG = True if args.no_control: af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") - if af is not None and any(af > 0.01 for af in map(float, af.split(','))) > 0.01: + if af is not None and any(af > args.kgenome_maxMAF for af in map(float, af.split(','))): in1KG_AF = True infofield["1000G"] = "1000G" # dbSNP @@ -247,11 +247,11 @@ def main(args): if args.no_control: if indbSNP and is_commonSNP and not is_clinic: reasons += "dbSNP(NoControl)" - if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): + if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True infofield["gnomAD_Exomes"] = "gnomAD_Exomes" reasons += "gnomAD_Exomes(NoControl)" - if help["GNOMAD_GENOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): + if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): inGnomAD_WGS = True infofield["gnomAD_Genomes"] = "gnomAD_Genomes" reasons += "gnomAD_Genomes(NoControl)" @@ -800,5 +800,10 @@ def main(args): parser.add_argument("-x", "--runexome", dest="runexome", action="store_true", default=False, help="Run on exome, will turn off the high control coverage punishment " \ "and the PCR bias filter.") + parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", help="Max gnomAD WGS MAF", default=0.001, type=float) + parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", help="Max gnomAD WES MAF", default=0.001, type=float) + parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", help="Max local control WGS MAF", default=0.01, type=float) + parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", help="Max local control WES MAF", default=0.01, type=float) + parser.add_argument("--1000genome_maxMAF", dest="kgenome_maxMAF", help="Max 1000 genome MAF", default=0.01, type=float) args = parser.parse_args() main(args) diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh index 575e50e..713a5bc 100755 --- a/resources/analysisTools/snvPipeline/snvAnnotation.sh +++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh @@ -197,7 +197,8 @@ mkfifo ${npConfidence} if [[ ${isNoControlWorkflow-false} == "false" ]]; then cat ${filenameSNVVCF} > ${npConfidence} & else - cat ${filenameSNVVCF} | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 > ${npConfidence} & + cat ${filenameSNVVCF} | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 \ + --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${npConfidence} & fi # create BaseScore FIFOs and their consumer processes (zip and write to target file) @@ -212,14 +213,15 @@ cat ${filenameReferenceAlleleReadPositions}_NP | ${BGZIP_BINARY} -f >${filenameR if [[ ${runArtifactFilter-true} == true ]] then cat ${npConfidence} | filterPeOverlap ${noControlFlag} --alignmentFile=${tumorbamfullpath} --mapq=$mapqual --baseq=$basequal --qualityScore=phred --maxNumberOfMismatchesInRead=${NUMBER_OF_MISMACTHES_THRESHOLD--1} --altBaseQualFile=${filenameAlternativeAlleleBaseScores}_NP --refBaseQualFile=${filenameReferenceAlleleBaseScores}_NP --altBasePositionsFile=${filenameAlternativeAlleleReadPositions}_NP --refBasePositionsFile ${filenameReferenceAlleleReadPositions}_NP --referenceFile=${REFERENCE_GENOME} \ - | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 -f ${filenameSomaticSNVsTmp} > ${filenameSNVVCFTemp}.tmp + | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 -f ${filenameSomaticSNVsTmp} \ + --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp}.tmp [[ $? != 0 ]] && echo "Error in first iteration of confidence annotation" && exit 2 NRSOMSNV=`grep -v "^#" ${filenameSomaticSNVsTmp} | wc -l` echo -e "SOMATIC_SNVS_UNFILTERED\t${NRSOMSNV}">> ${filenameQCValues} - mv ${filenameSNVVCFTemp}.tmp ${filenameSNVVCFTemp} + mv ${filenameSNVVCFTemp}.tmp ${filenameSNVVCFTemp} wait ${zipAlternativeAlleleBaseScores} ; [[ $? -gt 0 ]] && echo "Error from zipAlternativeAlleleBaseScores" && exit 31 wait ${zipReferenceAlleleBaseScores} ; [[ $? -gt 0 ]] && echo "Error from zipReferenceAlleleBaseScores" && exit 32 @@ -241,7 +243,8 @@ then fi cat ${filenameSNVVCFTemp} | ${PYTHON_BINARY} ${TOOL_FLAG_BIAS} --vcfFile="/dev/stdin" --referenceFile=${REFERENCE_GENOME} --sequence_specificFile=${filenamePCRerrorMatrix} --sequencing_specificFile=${filenameSequencingErrorMatrix} --numReads=${nReads} --numMuts=${nMuts} --biasPValThreshold=${biasPValThreshold} --biasRatioThreshold=${biasRatioThreshold} --biasRatioMinimum=${biasRatioMinimum} --maxNumOppositeReadsSequencingWeakBias=${maxNumOppositeReadsSequencingWeakBias} --maxNumOppositeReadsSequenceWeakBias=${maxNumOppositeReadsSequenceWeakBias} --maxNumOppositeReadsSequencingStrongBias=${maxNumOppositeReadsSequencingStrongBias} --maxNumOppositeReadsSequenceStrongBias=${maxNumOppositeReadsSequenceStrongBias} --ratioVcf=${rVcf} --bias_matrixSeqFile=${filenameBiasMatrixSeqFile} --bias_matrixSeqingFile=${filenameBiasMatrixSeqingFile} --vcfFileFlagged="/dev/stdout" | \ - ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 1 -f ${filenameSomaticSNVsTmp} > ${filenameSNVVCFTemp}.tmp + ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 1 -f ${filenameSomaticSNVsTmp} \ + --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp}.tmp [[ $? != 0 ]] && echo "Error in first filtering and/or second interation of confidence annotation" && exit 5 @@ -265,7 +268,8 @@ then fi cat ${filenameSNVVCFTemp} | ${PYTHON_BINARY} -u ${TOOL_FLAG_BIAS} --vcfFile="/dev/stdin" --referenceFile=${REFERENCE_GENOME} --sequence_specificFile=${filenamePCRerrorMatrix} --sequencing_specificFile=${filenameSequencingErrorMatrix} --numReads=${nReads} --numMuts=${nMuts} --biasPValThreshold=${biasPValThreshold} --biasRatioThreshold=${biasRatioThreshold} --biasRatioMinimum=${biasRatioMinimum} --maxNumOppositeReadsSequencingWeakBias=${maxNumOppositeReadsSequencingWeakBias} --maxNumOppositeReadsSequenceWeakBias=${maxNumOppositeReadsSequenceWeakBias} --maxNumOppositeReadsSequencingStrongBias=${maxNumOppositeReadsSequencingStrongBias} --maxNumOppositeReadsSequenceStrongBias=${maxNumOppositeReadsSequenceStrongBias} --ratioVcf=${rVcf} --bias_matrixSeqFile=${filenameBiasMatrixSeqFile} --bias_matrixSeqingFile=${filenameBiasMatrixSeqingFile} --vcfFileFlagged="/dev/stdout" | \ - ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} -a 2 > ${filenameSNVVCF} + ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} -a 2 \ + --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCF} [[ $? != 0 ]] && echo "Error in second filtering and/or third iteration of confidence annotation" && exit 8 @@ -287,7 +291,8 @@ then fi else cat ${npConfidence} | filterPeOverlap ${noControlFlag} --alignmentFile=${tumorbamfullpath} --mapq=$mapqual --baseq=$basequal --qualityScore=phred --maxNumberOfMismatchesInRead=${NUMBER_OF_MISMACTHES_THRESHOLD--1} --altBaseQualFile=${filenameAlternativeAlleleBaseScores}_NP --refBaseQualFile=${filenameReferenceAlleleBaseScores}_NP --altBasePositionsFile=${filenameAlternativeAlleleReadPositions}_NP --refBasePositionsFile=${filenameReferenceAlleleReadPositions}_NP --referenceFile=${REFERENCE_GENOME} | \ - ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} > ${filenameSNVVCFTemp} + ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} \ + --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp} exitCode=$? [[ $exitCode == 0 ]] && [[ -f ${filenameSNVVCFTemp} ]] && mv ${filenameSNVVCFTemp} ${filenameSNVVCF} From da889407fd8d967d54549759670575a9ac7548af Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Tue, 16 Mar 2021 10:36:17 +0100 Subject: [PATCH 19/49] Removed extra spaces --- buildversion.txt | 2 +- .../snvPipeline/confidenceAnnotation_SNVs.py | 4 +- .../snvPipeline/filter_PEoverlap.py | 72 +++++++++---------- .../snvPipeline/snvAnnotation.sh | 2 +- .../configurationFiles/analysisSNVCalling.xml | 10 +-- .../analysisSNVCallingGRCh38.xml | 14 ++-- 6 files changed, 52 insertions(+), 52 deletions(-) diff --git a/buildversion.txt b/buildversion.txt index e94ba2a..228b97b 100755 --- a/buildversion.txt +++ b/buildversion.txt @@ -1,2 +1,2 @@ -2.0 +2.1 0 diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index dd2c281..22be339 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -110,7 +110,7 @@ def main(args): "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$", "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$", "^seqBiasPresent_2$", "^seqingBiasPresent_2$"] - + if args.refgenome[0] == "hs37d5": fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED", "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$", @@ -123,7 +123,7 @@ def main(args): variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" - variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" + variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" else: fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ] diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py index f0cabc3..9b7fd57 100755 --- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py +++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py @@ -134,11 +134,11 @@ def performAnalysis(args): mode = "r" multiple_iterators = False if args.alignmentFile.split(".")[-1] == "bam": - mode += "b" + mode += "b" samfile = pysam.Samfile(args.alignmentFile, mode) elif args.alignmentFile.split(".")[-1] == "cram": - mode += "c" - samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file) + mode += "c" + samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file) if args.altPosF != '': ALT_basePositions_file = args.altPosF @@ -177,7 +177,7 @@ def performAnalysis(args): REF_baseQualities=[] ALT_baseQualities=[] - # how to treat multiallelic SNVs? Skipped in this current version... + # how to treat multiallelic SNVs? Skipped in this current version... if ((args.no_control and int(parsed_line["CONFIDENCE"]) > 7 and "somatic" in parsed_line["RECLASSIFICATION"]) or (not args.no_control and "somatic" in parsed_line["ANNOTATION_control"])) and len(parsed_line["ALT"]) == 1: # DP=13;AF1=0.5;AC1=1;DP4=2,3,3,4;MQ=37;FQ=75;PV4=1,1,1,1 info_values = parsed_line["INFO"].split(';') @@ -187,14 +187,14 @@ def performAnalysis(args): DP4 = map(int, info_value[4:].split(',')) DP4rf, DP4rr, DP4af, DP4ar = DP4 DP4_original = re.sub('DP4', 'DP4original', info_value) # Keeping a backup of original DP4 - DP4_original_alt = DP4af + DP4ar + DP4_original_alt = DP4af + DP4ar break chrom=parsed_line["CHROM"] pos=int(parsed_line["POS"]) REF=parsed_line["REF"] ALT=parsed_line["ALT"] - + readNameHash={} readMateHash={} # Hash to store read and mate starting positions for duplicate marking readMateHash_qnameLocation={} # Hash to store the location of gname in the above hash list @@ -202,22 +202,22 @@ def performAnalysis(args): ACGTNacgtn1 = [0]*10 ACGTNacgtn2 = [0]*10 - count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection + count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection count_supple = BoolCounter(REF, ALT) # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions count_mismatch = BoolCounter(REF, ALT) # " for mismatch report count_nonREFnonALT = BoolCounter(REF, ALT) # " for nonREF and nonALT bases # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq). for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators): - if pileupcolumn.pos == (pos-1): - #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments) - for pileupread in pileupcolumn.pileups: - if pileupread.is_del: - # 31 May 2016 JB: deletion at the pileup position - continue + if pileupcolumn.pos == (pos-1): + #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments) + for pileupread in pileupcolumn.pileups: + if pileupread.is_del: + # 31 May 2016 JB: deletion at the pileup position + continue baseScore = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] readpos = pileupread.query_position - if pileupread.alignment.seq[pileupread.query_position].lower() == ALT.lower(): + if pileupread.alignment.seq[pileupread.query_position].lower() == ALT.lower(): if args.altBQF != '': ALT_baseQualities.append(baseScore) if args.altPosF != '': @@ -237,7 +237,7 @@ def performAnalysis(args): if pileupread.alignment.mapq >= args.mapq: # http://wwwfgu.anat.ox.ac.uk/~andreas/documentation/samtools/api.html USE qqual try: - if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: + if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: # check if we consider this read as a proper read in terms of number of mismatches if args.allowedNumberOfMismatches > -1: numberOfMismatches = None @@ -246,14 +246,14 @@ def performAnalysis(args): numberOfMismatches = tag[1] break else: - continue - + continue + if numberOfMismatches is not None: if numberOfMismatches > args.allowedNumberOfMismatches: remove_base = pileupread.alignment.seq[pileupread.query_position] remove_is_reverse = pileupread.alignment.is_reverse count_mismatch.update(remove_base) - (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) + (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) # after decreasing the respective DP4 value, go directly to the next read # without remembering the current read # This will lead to an unknown read name when the paired read occurs at the same @@ -261,10 +261,10 @@ def performAnalysis(args): # have to decrease DP4 values again, when the read partner occurs at the same SNV. # We also do not increase ANCGTNacgtn for the discarded read. continue - + # Check if pileupread.alignment is proper pair if(pileupread.alignment.is_proper_pair): - # count to ACGTNacgtn list + # count to ACGTNacgtn list is_reverse = pileupread.alignment.is_reverse is_read1 = pileupread.alignment.is_read1 base = pileupread.alignment.seq[pileupread.query_position].lower() @@ -332,17 +332,17 @@ def performAnalysis(args): readMateHash[read_mate_tuple] = [] readMateHash[read_mate_tuple].append(read_mate_tuple_value) - readMateHash_qnameLocation[pileupread.alignment.query_name] = len(readMateHash[read_mate_tuple]) - 1 # Location of the last pushed element in the array + readMateHash_qnameLocation[pileupread.alignment.query_name] = len(readMateHash[read_mate_tuple]) - 1 # Location of the last pushed element in the array except IndexError: "soft-clipped or trimmed base, not part of the high-qual alignemnt anyways, skip" if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: - + if pileupread.alignment.seq[pileupread.query_position] == ALT: - ALTcount += 1 + ALTcount += 1 # samtools mpileup sometimes counts bases as variants which are neither REF nor ALT - if (pileupread.alignment.seq[pileupread.query_position] != REF) and (pileupread.alignment.seq[pileupread.query_position] != ALT): + if (pileupread.alignment.seq[pileupread.query_position] != REF) and (pileupread.alignment.seq[pileupread.query_position] != ALT): if pileupread.alignment.is_reverse: nonREFnonALTrev += 1 #if DP4ar > 0: DP4ar -= 1 @@ -384,11 +384,11 @@ def performAnalysis(args): # Calculating duplicates based on read-mate pair's start positions (chr id and start location) count_duplicate = BoolCounter(REF, ALT) for key in readMateHash: - value_length = len(readMateHash[key]) - if value_length > 0: + value_length = len(readMateHash[key]) + if value_length > 0: sorted_values = sorted(readMateHash[key], key=lambda x: x[0]) # Sorted based on base quality sorted_values = sorted_values[:-1] # removing the read with highest quality, so it will be retained for count - for value in sorted_values: # Removing everthing else + for value in sorted_values: # Removing everthing else qual_value, decreaseInfo = value remove_base = decreaseInfo[0] remove_is_reverse = decreaseInfo[1] @@ -396,32 +396,32 @@ def performAnalysis(args): count_duplicate.update(remove_base) (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - if (DP4[2] + DP4[3]) > ALTcount: # that the ALTcount is larger happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling - if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd + if (DP4[2] + DP4[3]) > ALTcount: # that the ALTcount is larger happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling + if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev - + nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd count_nonREFnonALT.update_nonREFnonALT(nonREFnonALT) - + #Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)" supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted())) supple_dup_str += ';DP2dup=' + ','.join(map(str, count_duplicate.counted())) supple_dup_str += ';DP2pairEnd=' + ','.join(map(str, count_PE.counted())) supple_dup_str += ';DP2mis=' + ','.join(map(str, count_mismatch.counted())) supple_dup_str += ';DP2nonREFnonALT=' + ','.join(map(str, count_nonREFnonALT.counted())) - + ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1]) ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2]) - info_values[DP4_idx] = "DP4=" + str(DP4rf)+ "," + str(DP4rr)+ "," + str(DP4af)+ "," + str(DP4ar) + info_values[DP4_idx] = "DP4=" + str(DP4rf)+ "," + str(DP4rr)+ "," + str(DP4af)+ "," + str(DP4ar) info_values.append(ACGTNacgtn1_string) info_values.append(ACGTNacgtn2_string) info_values.append(DP4_original) - info_values.append(supple_dup_str) + info_values.append(supple_dup_str) entries[header_indices["INFO"]] = ';'.join(info_values) - + sys.stdout.write('\t'.join(entries) + '\n') else: sys.stdout.write(line) # write germline and somatic-multiallelic SNVs as is @@ -446,7 +446,7 @@ def performAnalysis(args): #vcfInFile.close() #outFile.close() - + if __name__ == '__main__': #print "Starting program...\n" import argparse diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh index 713a5bc..856ef35 100755 --- a/resources/analysisTools/snvPipeline/snvAnnotation.sh +++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh @@ -221,7 +221,7 @@ then NRSOMSNV=`grep -v "^#" ${filenameSomaticSNVsTmp} | wc -l` echo -e "SOMATIC_SNVS_UNFILTERED\t${NRSOMSNV}">> ${filenameQCValues} - mv ${filenameSNVVCFTemp}.tmp ${filenameSNVVCFTemp} + mv ${filenameSNVVCFTemp}.tmp ${filenameSNVVCFTemp} wait ${zipAlternativeAlleleBaseScores} ; [[ $? -gt 0 ]] && echo "Error from zipAlternativeAlleleBaseScores" && exit 31 wait ${zipReferenceAlleleBaseScores} ; [[ $? -gt 0 ]] && echo "Error from zipReferenceAlleleBaseScores" && exit 32 diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 6049fab..61e4bf7 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -39,15 +39,15 @@ - + - - - + + + - + diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml index 77711f8..d8827b2 100644 --- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -10,8 +10,8 @@ - - + + @@ -20,11 +20,11 @@ - + - - + + @@ -42,11 +42,11 @@ - + - + From aef5111697bc116fcd90c6910b141f1ed12f3daf Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 26 Apr 2021 17:03:33 +0200 Subject: [PATCH 20/49] Uncommenting reference detection --- resources/analysisTools/snvPipeline/snvAnnotation.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh index 856ef35..94a4bbe 100755 --- a/resources/analysisTools/snvPipeline/snvAnnotation.sh +++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh @@ -36,8 +36,8 @@ scriptDirectory=`dirname ${WRAPPED_SCRIPT}` #TOOLS_DIR=${BASEPATH_TOOLS} FILEBASE=`dirname ${FILENAME_VCF_IN}`/`basename ${FILENAME_VCF_IN} .vcf` -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME [[ -f ${FILENAME_CHECKPOINT} ]] && rm ${FILENAME_CHECKPOINT} From 22a55ecd90ebee01de7ccfe116a54c339294df7f Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 30 Apr 2021 11:31:45 +0200 Subject: [PATCH 21/49] updating refgenome help --- .../analysisTools/snvPipeline/confidenceAnnotation_SNVs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index 22be339..f1b03ae 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -776,8 +776,10 @@ def main(args): parser.add_argument("-g", "--refgenome", dest="refgenome", nargs=2, default=["hs37d5", "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/" \ "phase2_reference_assembly_sequence/hs37d5.fa.gz", ], - help="reference genome used for calling ID, path (default hs37d5, ftp://ftp.1000genomes.ebi" \ - ".ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)") + help="Argument's first value is used to determine if the workflow is using hg19 or hg38 reference genome. " \ + "Allowed first values are 'hs37d5' or 'GRCh37', second value could be the downloaded URL of the reference genome. " \ + "(default hs37d5, ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)" \ + "") parser.add_argument("-z", "--center", dest="center", nargs="?", default="DKFZ", help="Center (unclear if the center where the data was produced or the center of the " \ "calling pipeline; default DKFZ).") From 71900b37e8795b1cbeb316c6284b7b78bda72840 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 30 Apr 2021 11:40:27 +0200 Subject: [PATCH 22/49] Raise error: unknown alignment suffix --- resources/analysisTools/snvPipeline/filter_PEoverlap.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py index 03db8dc..4807e8a 100755 --- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py +++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py @@ -139,6 +139,8 @@ def performAnalysis(args): elif args.alignmentFile.split(".")[-1] == "cram": mode += "c" samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file) + else: + raise "Unknown file alignment suffix '%s'. Need 'bam' or 'cram'" % args.alignmentFile.split(".")[-1] if args.altPosF != '': ALT_basePositions_file = args.altPosF From dbd835d66161591fc858d7733433c5d341df43a9 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 30 Apr 2021 11:46:30 +0200 Subject: [PATCH 23/49] Reformatting in_dbSNPcounter.pl --- .../snvPipeline/in_dbSNPcounter.pl | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl index 737ae04..28c1a32 100755 --- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl +++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl @@ -49,11 +49,26 @@ my $i = 0; while($i < @head) { - if($head[$i] eq "DBSNP"){$col{"DBSNP"} = $i; print STDERR "DBSNP in column ", $i+1,"\n";} - if($head[$i] eq "1K_GENOMES"){$col{"1K_GENOMES"} = $i; print STDERR "1K_GENOMES in column ", $i+1,"\n";} - if($head[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print STDERR "RECLASSIFICATION in column ", $i+1,"\n";} - if($head[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print STDERR "ANNOTATION_control in column ", $i+1,"\n";} - if($head[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print STDERR "CONFIDENCE in column ", $i+1,"\n";} + if($head[$i] eq "DBSNP"){ + $col{"DBSNP"} = $i; + print STDERR "DBSNP in column ", $i+1,"\n"; + } + if($head[$i] eq "1K_GENOMES"){ + $col{"1K_GENOMES"} = $i; + print STDERR "1K_GENOMES in column ", $i+1,"\n"; + } + if($head[$i] eq "RECLASSIFICATION"){ + $col{"RECLASSIFICATION"} = $i; + print STDERR "RECLASSIFICATION in column ", $i+1,"\n"; + } + if($head[$i] eq "ANNOTATION_control"){ + $col{"ANNOTATION_control"} = $i; + print STDERR "ANNOTATION_control in column ", $i+1,"\n"; + } + if($head[$i] eq "CONFIDENCE"){ + $col{"CONFIDENCE"} = $i; + print STDERR "CONFIDENCE in column ", $i+1,"\n"; + } $i++; } From df184d9806302e52773e4d7ca0d00da021aa92c8 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 30 Apr 2021 12:00:11 +0200 Subject: [PATCH 24/49] Reformatting IMD R file --- .../snvPipeline/intermutationDistance_Coord_color.r | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r index 67ec8b3..faa1757 100755 --- a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r +++ b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r @@ -46,8 +46,11 @@ data <- read.table(opt$chrLengthFile, header=FALSE) chrLength <- data.frame(data) rownames(chrLength) <- chrLength$V1 -if((opt$chrPrefix != "" && grepl(opt$chrPrefix, chrLength$V1[1])) && !grepl(opt$chrPrefix, c.array[1])){ - c.array <- paste0(opt$chrPrefix, c.array, opt$chrSuffix) +if(opt$chrPrefix != "" && + grepl(opt$chrPrefix, chrLength$V1[1]) && + !grepl(opt$chrPrefix, c.array[1])){ + + c.array <- paste0(opt$chrPrefix, c.array, opt$chrSuffix) } xtotal <- sum(chrLength[c.array,2]/10) From f93779c5672c47e81de8922911c73b3ca81a0d6e Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 29 Sep 2021 11:10:32 +0200 Subject: [PATCH 25/49] Increasing the mem requirement --- resources/configurationFiles/analysisSNVCalling.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 61e4bf7..07ec8de 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -231,8 +231,8 @@ - - + + @@ -247,7 +247,7 @@ - + @@ -261,7 +261,7 @@ - + From 7beae9320de5f063a6654a9e9e5d48b020dc10b7 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 18 Oct 2021 12:25:24 +0200 Subject: [PATCH 26/49] safer IO for in_dbSNPcounter.pl --- .../snvPipeline/in_dbSNPcounter.pl | 117 +++++++++--------- 1 file changed, 58 insertions(+), 59 deletions(-) diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl index 28c1a32..0c0b93d 100755 --- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl +++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl @@ -23,7 +23,7 @@ { $minscore = 8; } -open (FH, $file) or die "Could not open $file: $!\n"; +open (my $FH, $file) or die "Could not open $file: $!\n"; # count all, somatic, s. with minconfidence, in dbSNP, in 1KG my $all = 0; @@ -37,75 +37,75 @@ my $header = ""; -while ($header = ) -{ - last if ($header =~ /^\#CHROM/); # that is the line with the column names -} -chomp $header; +my ($DBSBP, $KG, $CONFIDENCE, $CLASSIFICATION); -my @head=split("\t", $header); -my %col; +while(!eof($FH)){ + my $line = readline($FH) || die "Could not read line: $?"; + chomp $line; + if ($line =~ /^#/) + { + #print STDOUT "HELLO\n"; + if ($line =~/^#CHROM/){ + chomp $line; -my $i = 0; -while($i < @head) -{ - if($head[$i] eq "DBSNP"){ - $col{"DBSNP"} = $i; - print STDERR "DBSNP in column ", $i+1,"\n"; - } - if($head[$i] eq "1K_GENOMES"){ - $col{"1K_GENOMES"} = $i; - print STDERR "1K_GENOMES in column ", $i+1,"\n"; - } - if($head[$i] eq "RECLASSIFICATION"){ - $col{"RECLASSIFICATION"} = $i; - print STDERR "RECLASSIFICATION in column ", $i+1,"\n"; - } - if($head[$i] eq "ANNOTATION_control"){ - $col{"ANNOTATION_control"} = $i; - print STDERR "ANNOTATION_control in column ", $i+1,"\n"; - } - if($head[$i] eq "CONFIDENCE"){ - $col{"CONFIDENCE"} = $i; - print STDERR "CONFIDENCE in column ", $i+1,"\n"; - } - $i++; -} + my @head=split("\t", $line); + my %col; -my $DBSBP = $col{"DBSNP"}; -my $KG = $col{"1K_GENOMES"}; -my $CONFIDENCE = $col{"CONFIDENCE"}; -my $CLASSIFICATION = $col{"RECLASSIFICATION"}; + my $i = 0; + while($i < @head) + { + if($head[$i] eq "DBSNP"){ + $col{"DBSNP"} = $i; + print STDERR "DBSNP in column ", $i+1,"\n"; + } + if($head[$i] eq "1K_GENOMES"){ + $col{"1K_GENOMES"} = $i; + print STDERR "1K_GENOMES in column ", $i+1,"\n"; + } + if($head[$i] eq "RECLASSIFICATION"){ + $col{"RECLASSIFICATION"} = $i; + print STDERR "RECLASSIFICATION in column ", $i+1,"\n"; + } + if($head[$i] eq "ANNOTATION_control"){ + $col{"ANNOTATION_control"} = $i; + print STDERR "ANNOTATION_control in column ", $i+1,"\n"; + } + if($head[$i] eq "CONFIDENCE"){ + $col{"CONFIDENCE"} = $i; + print STDERR "CONFIDENCE in column ", $i+1,"\n"; + } + $i++; + } -# I know that it's evilly hardcoded! -while () -{ - if ($_ =~ /^#/) # skip header lines - { - next; + $DBSBP = $col{"DBSNP"}; + $KG = $col{"1K_GENOMES"}; + $CONFIDENCE = $col{"CONFIDENCE"}; + $CLASSIFICATION = $col{"RECLASSIFICATION"}; + } } - $all++; - @help = split ("\t", $_); - if ($help[$CLASSIFICATION] =~ /somatic/) - { - $somatic++; - if ($help[$CONFIDENCE] >= $minscore) + else{ + $all++; + @help = split ("\t", $line); + if ($help[$CLASSIFICATION] =~ /somatic/) { - $scored++; - #print STDERR $_; - if ($help[$DBSBP] =~ /MATCH=exact/) - { - $dbSNP++; - } - if ($help[$KG] =~ /MATCH=exact/) + $somatic++; + if ($help[$CONFIDENCE] >= $minscore) { - $oneKG++; + $scored++; + if ($help[$DBSBP] =~ /MATCH=exact/) + { + $dbSNP++; + } + if ($help[$KG] =~ /MATCH=exact/) + { + $oneKG++; + } } } } } +close $FH; -close FH; if ($scored > 0) { $perc_dbSNP = sprintf ("%.2f", ($dbSNP/$scored*100)); @@ -116,6 +116,5 @@ else # there might be no ... { die "!!! There are no SNVs with confidence >= $minscore in the input, it makes no sense to do further analyses with the current settings on these data!!!\n"; - } exit; From d27035818c8f2294fd678f5669b8fb58165291bf Mon Sep 17 00:00:00 2001 From: paramasi Date: Wed, 4 May 2022 09:51:07 +0200 Subject: [PATCH 27/49] Add variant calling in HLA/ALT contigs --- .../snvPipeline/PurityReloaded.py | 4 +++ .../snvPipeline/createErrorPlots.py | 2 +- .../snvPipeline/filter_PEoverlap.py | 6 +++- .../analysisTools/snvPipeline/snvCalling.sh | 28 ++++++++++++++++--- .../vcf_pileup_compare_allin1_basecount.pl | 14 +++++++--- .../analysisSNVCallingGRCh38.xml | 10 +++++++ 6 files changed, 54 insertions(+), 10 deletions(-) diff --git a/resources/analysisTools/snvPipeline/PurityReloaded.py b/resources/analysisTools/snvPipeline/PurityReloaded.py index e3aadf9..b8dcb7d 100755 --- a/resources/analysisTools/snvPipeline/PurityReloaded.py +++ b/resources/analysisTools/snvPipeline/PurityReloaded.py @@ -251,6 +251,10 @@ def parseVcf(file,num): while (l!= ""): t=l.split('\t') if (t[0][0] != "#") and isValid(t): + # Skiping the non-primary assembly variants from purity calculations + if t[0].startswith('HLA') or t[0].endswith('_alt'): + l=vcf.readline() + continue i = chromMap[t[0]] if (t[12]=="germline"): #DP5=string.split(string.split(t[11],";")[1],",") diff --git a/resources/analysisTools/snvPipeline/createErrorPlots.py b/resources/analysisTools/snvPipeline/createErrorPlots.py index 2ed2416..dc7dbfc 100755 --- a/resources/analysisTools/snvPipeline/createErrorPlots.py +++ b/resources/analysisTools/snvPipeline/createErrorPlots.py @@ -31,7 +31,7 @@ def calculateLogSize(size, max_val, base): return math.log(((size/max_val)*(base-1.))+1., base) def calculateRootedSize(size, max_val): - if(float(size) != 0.0): + if(float(size) != 0.0 and float(max_val) != 0.0): return np.sqrt(size/max_val) else: return 0.0 diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py index 4807e8a..6d19fcf 100755 --- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py +++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py @@ -210,7 +210,11 @@ def performAnalysis(args): count_nonREFnonALT = BoolCounter(REF, ALT) # " for nonREF and nonALT bases # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq). - for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators): + if chrom.endswith('alt') or chrom.startswith('HLA'): + flag_filer_value = 1796 + else: + flag_filer_value = 3844 + for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=flag_filer_value, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators): if pileupcolumn.pos == (pos-1): #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments) for pileupread in pileupcolumn.pileups: diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh index de88566..d931226 100755 --- a/resources/analysisTools/snvPipeline/snvCalling.sh +++ b/resources/analysisTools/snvPipeline/snvCalling.sh @@ -30,7 +30,12 @@ getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFI chr="" if [[ -n "$PARM_CHR_INDEX" ]] then - chr=${CHR_PREFIX}${PARM_CHR_INDEX}${CHR_SUFFIX} + if [[ "$PARM_CHR_INDEX" == "ALT" || "$PARM_CHR_INDEX" == "HLA" ]] + then + chr=${PARM_CHR_INDEX} + else + chr=${CHR_PREFIX}${PARM_CHR_INDEX}${CHR_SUFFIX} + fi else chr=${CHR_PREFIX}${CHROMOSOME_INDICES[$((RODDY_JOBARRAYINDEX-1))]}${CHR_SUFFIX} # File names can contain X and Y. The pattern contains RODDY_JOBARRAYINDEX so this has to be changed temporary as the value is always numeric! @@ -66,8 +71,23 @@ filenameMPileupError="${MPILEUP_SUBDIR}/${MPILEUPOUT_PREFIX}${PID}.${chr}.error" FILENAME_VCF_SNVS_TEMP=${FILENAME_VCF_SNVS}.tmp -#[[ ! -f $filenameMPileupTemp ]] && -${BCFTOOLS_BINARY} mpileup ${MPILEUP_OPTS} -r ${chr} -f ${REFERENCE_GENOME} $tumorbamfullpath | ${BCFTOOLS_BINARY} call ${BCFTOOLS_OPTS} > $filenameMPileupTemp +## For variant calling from ALT and HLA region in the GRCh38 genome +# The *_CONTIGS_FILE contains the list of the contigs names +MPILEUP_REGION_OPTION="" +if [[ "$chr" == "HLA" ]] +then + MPILEUP_REGION_OPTION="-R ${CHROMOSOME_HLA_CONTIGS_FILE}" + MPILEUP_OPTS=$MPILEUP_OPTS_ALT_HLA +elif [[ "$chr" == "ALT" ]] +then + MPILEUP_REGION_OPTION="-R ${CHROMOSOME_ALT_CONTIGS_FILE}" + MPILEUP_OPTS=$MPILEUP_OPTS_ALT_HLA +else + MPILEUP_REGION_OPTION="-r ${chr}" +fi + +#[[ ! -f $filenameMPileupTemp ]] && +${BCFTOOLS_BINARY} mpileup ${MPILEUP_OPTS} ${MPILEUP_REGION_OPTION} -f ${REFERENCE_GENOME} $tumorbamfullpath | ${BCFTOOLS_BINARY} call ${BCFTOOLS_OPTS} > $filenameMPileupTemp if [[ "$?" != 0 ]] then @@ -103,7 +123,7 @@ if [[ -n "$firstLineVCF" ]]; then mkfifo $NP_MPILEUP # if there is a germline BAM, first look up these positions in the control file by just piling up the bases, this is NO SNV calling - ${SAMTOOLS_BINARY} mpileup ${MPILEUPCONTROL_OPTS} -r ${chr} -l ${filenameMPileupOut} -f ${REFERENCE_GENOME} ${CONTROL_BAMFILE_FULLPATH_BP} > ${NP_MPILEUP} & + ${SAMTOOLS_BINARY} mpileup ${MPILEUPCONTROL_OPTS} -l ${filenameMPileupOut} -f ${REFERENCE_GENOME} ${CONTROL_BAMFILE_FULLPATH_BP} > ${NP_MPILEUP} & ${PERL_BINARY} ${TOOL_VCF_PILEUP_COMPARE} ${filenameMPileupOut} $NP_MPILEUP "Header" > ${FILENAME_VCF_SNVS_TEMP} rm $NP_MPILEUP diff --git a/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl b/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl index e455d0a..7700ab8 100755 --- a/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl +++ b/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl @@ -136,6 +136,9 @@ my $ccoord = -1; # make sure that ccoord in first iteration is smaller than tcoord my $tcoord = 0; +my $current_c_chr = ""; +my $current_t_chr = ""; + # one file is longer than the other, nevermind which #(if we did pileup for only the tumor SNV positions, control will be shorter; otherwise longer) TUMORFILE_LOOP: while ($lineT=) @@ -150,7 +153,9 @@ chomp $lineT; @tum = split (/\s+/, $lineT); $tcoord = $tum[1]; # start coordinate - if ($tcoord < $ccoord) # no matching control line => tumor position not covered in control (never true in 1st iteration) + $current_t_chr = $tum[0]; + + if ($tcoord < $ccoord && $current_t_chr eq $current_c_chr) # no matching control line => tumor position not covered in control (never true in 1st iteration) { $missingC++; # print the tumor line with according "flag" @@ -158,7 +163,7 @@ print $lineT, "\tDP=0;DP5=0,0,0,0,0;DP5all=0,0,0,0,0;ACGTNacgtnHQ=0,0,0,0,0,0,0,0,0,0;ACGTNacgtn=0,0,0,0,0,0,0,0,0,0;VAF=0;TSR=0\t$notcovered\n"; next; # read next line from tumor } - if ($tcoord == $ccoord) # matching pair found! + if ($tcoord == $ccoord && $current_t_chr eq $current_c_chr) # matching pair found! { $match++; chomp $lineC; @@ -176,15 +181,16 @@ $ctrC++; # split lines @ctrl = split (/\s+/, $lineC); + $current_c_chr = $ctrl[0]; $ccoord = $ctrl[1]; - if ($tcoord == $ccoord) # matching pair found! + if ($tcoord == $ccoord && $current_t_chr eq $current_c_chr) # matching pair found! { $match++; # do the evaluation in a subroutine evaluate_pos(); next TUMORFILE_LOOP; # and read next line from tumor } - if ($tcoord < $ccoord) # new ccoord is higher than tcoord => tumor position not covered in control (should never be the case!) + if ($tcoord < $ccoord && $current_t_chr eq $current_c_chr) # new ccoord is higher than tcoord => tumor position not covered in control (should never be the case!) { $missingC++; print $lineT, "\tDP=0;DP5=0,0,0,0,0;DP5all=0,0,0,0,0;ACGTNacgtnHQ=0,0,0,0,0,0,0,0,0,0;ACGTNacgtn=0,0,0,0,0,0,0,0,0,0;VAF=0;TSR=0\t$notcovered\n"; diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml index d8827b2..974b29c 100644 --- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -18,6 +18,16 @@ + + + + + + + + + + From b683df259d3528269448835a1a154b3dc7d2678f Mon Sep 17 00:00:00 2001 From: paramasi Date: Wed, 4 May 2022 09:51:53 +0200 Subject: [PATCH 28/49] Update ngs_share path --- resources/configurationFiles/analysisSNVCalling.xml | 4 ++-- resources/configurationFiles/analysisSNVCallingGRCh38.xml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 07ec8de..8ad7427 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -45,8 +45,8 @@ - - + + diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml index 974b29c..d1c2e79 100644 --- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -10,7 +10,7 @@ - + From 1c0bc26745028ac3c8cbc8c72977ac1ba935e00d Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 30 May 2022 11:01:02 +0200 Subject: [PATCH 29/49] Remove RE/MAP from hg38 penalties --- .../snvPipeline/confidenceAnnotation_SNVs.py | 121 +++++++++--------- 1 file changed, 60 insertions(+), 61 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index f1b03ae..d1089cf 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -311,39 +311,38 @@ def main(args): reasons += "bias_filter_round2_SEQ(-3)" filterfield["BI"] = 1 - # 2) annotations of regions that cause problems: some classes of repeats from RepeatMasker track, - # segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability - # simple repeats and low complexity (not the same as homopolymer, but similar enough); - # some satellites are not annotated in blacklist ... - if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]): - is_repeat = True - confidence -= 2 - reasons += "Simple_repeat(-2)" - filterfield["RE"] = 1 - # other repeat elements to penalize at least a bit - elif help["REPEAT_MASKER_VALID"]: - confidence -= 1 - reasons += "Other_repeat(-1)" - filterfield["RE"] = 1 - - # simple tandem repeats most often coincide with other bad features - do not penalize twice - if help["SIMPLE_TANDEMREPEATS_VALID"]: - is_STR = 1 - if not is_repeat: + # Only for hg19 reference genome + if args.refgenome[0] == "hs37d5": + # 2) annotations of regions that cause problems: some classes of repeats from RepeatMasker track, + # segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability + # simple repeats and low complexity (not the same as homopolymer, but similar enough); + # some satellites are not annotated in blacklist ... + if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]): + is_repeat = True confidence -= 2 - reasons += "Tandem_repeat(-2)" + reasons += "Simple_repeat(-2)" + filterfield["RE"] = 1 + # other repeat elements to penalize at least a bit + elif help["REPEAT_MASKER_VALID"]: + confidence -= 1 + reasons += "Other_repeat(-1)" filterfield["RE"] = 1 - # Segmental Duplications are less effective than homopolymers, short tandem repeats and microsatellites, - # do not penality twice - if help["ANNOVAR_SEGDUP_COL_VALID"] and not (is_repeat or is_STR): - confidence -= 2 # bad region - is_weird = True - reasons += "Segmental_dup(-2)" - filterfield["RE"] = 1 + # simple tandem repeats most often coincide with other bad features - do not penalize twice + if help["SIMPLE_TANDEMREPEATS_VALID"]: + is_STR = 1 + if not is_repeat: + confidence -= 2 + reasons += "Tandem_repeat(-2)" + filterfield["RE"] = 1 - # Only for hg19 reference genome - if args.refgenome[0] == "hs37d5": + # Segmental Duplications are less effective than homopolymers, short tandem repeats and microsatellites, + # do not penality twice + if help["ANNOVAR_SEGDUP_COL_VALID"] and not (is_repeat or is_STR): + confidence -= 2 # bad region + is_weird = True + reasons += "Segmental_dup(-2)" + filterfield["RE"] = 1 # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]: @@ -360,44 +359,44 @@ def main(args): reasons += "Hiseqdepth(-3)" filterfield["HSDEPTH"] = 1 - # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... - # Everything with really high number of occurences is artefacts - # does not always correlate with the above regions - # is overestimating badness bc. of _single_ end read simulations - if help["MAPABILITY"] == ".": - # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 - confidence -= 5 - reasons += "Not_mappable(-5)" - filterfield["MAP"] = 1 - else: - reduce = 0 - mapability = min(map(float, help["MAPABILITY"].split("&"))) - if mapability < 0.5: - # 0.5 does not seem to be that bad: region appears another time in - # the genome and we have paired end data! - confidence -= 1 - reduce += 1 - - is_weird = True # something _is_ weird already there and known SNPs might be artefacts - - if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad + # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... + # Everything with really high number of occurences is artefacts + # does not always correlate with the above regions + # is overestimating badness bc. of _single_ end read simulations + if help["MAPABILITY"] == ".": + # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 + confidence -= 5 + reasons += "Not_mappable(-5)" + filterfield["MAP"] = 1 + else: + reduce = 0 + mapability = min(map(float, help["MAPABILITY"].split("&"))) + if mapability < 0.5: + # 0.5 does not seem to be that bad: region appears another time in + # the genome and we have paired end data! confidence -= 1 reduce += 1 - if mapability < 0.25: # > 4 times appearing region - confidence -= 1 - reduce += 1 + is_weird = True # something _is_ weird already there and known SNPs might be artefacts - if mapability < 0.1: # > 5 times is bad - confidence -= 2 - reduce += 2 + if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad + confidence -= 1 + reduce += 1 - if mapability < 0.05: # these regions are clearly very bad (Lego stacks) - confidence -= 3 - reduce += 3 + if mapability < 0.25: # > 4 times appearing region + confidence -= 1 + reduce += 1 - filterfield["MAP"] = 1 - reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) + if mapability < 0.1: # > 5 times is bad + confidence -= 2 + reduce += 2 + + if mapability < 0.05: # these regions are clearly very bad (Lego stacks) + confidence -= 3 + reduce += 3 + + filterfield["MAP"] = 1 + reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) # if others have found the SNP already, it may be interesting despite low score # - but only if it's not a weird region. From 6d479821b36cb8d700ba9d0ff10e6d2474fdc0c5 Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 8 Jul 2022 12:51:35 +0200 Subject: [PATCH 30/49] Add m2e2,HLA/ALT mappability files --- resources/configurationFiles/analysisSNVCallingGRCh38.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml index d1c2e79..1bb222f 100644 --- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -20,8 +20,8 @@ - - + + @@ -59,7 +59,7 @@ - + @@ -75,4 +75,4 @@ - \ No newline at end of file + From 01567ac14d9c977dd33904737b5b4921cf764f5d Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 16 Sep 2022 15:07:41 +0200 Subject: [PATCH 31/49] Fix the diagnostic plots for GRCh38 --- resources/analysisTools/snvPipeline/filter_vcf.sh | 2 +- resources/analysisTools/snvPipeline/snvsPerChromPlot.r | 1 + resources/configurationFiles/analysisSNVCalling.xml | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh index 0e8f226..2ae1534 100755 --- a/resources/analysisTools/snvPipeline/filter_vcf.sh +++ b/resources/analysisTools/snvPipeline/filter_vcf.sh @@ -24,7 +24,7 @@ getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFI ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}` # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail... -declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES}" +declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES_PLOTTING}" numberOfChromosomes=${CHROMOSOME_INDICES[@]} outputFilenamePrefix=${mpileupDirectory}/${SNVFILE_PREFIX}${PID} outputFilenamePrefix_original=${outputFilenamePrefix} diff --git a/resources/analysisTools/snvPipeline/snvsPerChromPlot.r b/resources/analysisTools/snvPipeline/snvsPerChromPlot.r index 3e6c4cb..b4f3627 100755 --- a/resources/analysisTools/snvPipeline/snvsPerChromPlot.r +++ b/resources/analysisTools/snvPipeline/snvsPerChromPlot.r @@ -48,6 +48,7 @@ dat$chromosome <- factor(dat$chromosome, levels = paste0("chr",c(seq(1,22),"X"," chromLength = read.table(file = opt$chromLengthFile, header = F) +chromLength$V1 = gsub("chr", "", chromLength$V1) rownames(chromLength) = paste0("chr",chromLength$V1) chromLength = chromLength[,2, drop= F] chromLengthMB = round(chromLength/1000000) diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 8ad7427..620a8ad 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -127,6 +127,8 @@ + + From 6321e0ac5634e3256b000af6efcd6e031003b73e Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 16 Sep 2022 15:11:11 +0200 Subject: [PATCH 32/49] Remove the quote for RAW_SNV_FILTER_OPTIONS --- resources/analysisTools/snvPipeline/snvCalling.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh index d931226..fc7d668 100755 --- a/resources/analysisTools/snvPipeline/snvCalling.sh +++ b/resources/analysisTools/snvPipeline/snvCalling.sh @@ -106,7 +106,7 @@ firstLineVCF=`cat $filenameMPileupTemp | grep -v "^#" | head -n1` if [[ -n "$firstLineVCF" ]]; then # filter out mutations with strand bias next to the motif GGnnGG (or CCnnCC if snv is on reverse strand) ${PERL_BINARY} "$TOOL_SEQ_CONTEXT_ANNOTATOR" "$FASTAFROMBED_BINARY" "$filenameMPileupTemp" "$REFERENCE_GENOME" 10 "$TOOLS_DIR" | \ - ${PYTHON_BINARY} "$TOOL_RAW_SNV_FILTER" --outf="$snvOut" "$RAW_SNV_FILTER_OPTIONS" # --inf=$MPILEUP_SUBDIR/forStrandBiasFilter.${chr}.bed + ${PYTHON_BINARY} "$TOOL_RAW_SNV_FILTER" --outf="$snvOut" ${RAW_SNV_FILTER_OPTIONS} # --inf=$MPILEUP_SUBDIR/forStrandBiasFilter.${chr}.bed if [[ "$?" == 0 ]] then From 0adf1e4527aae48c04f7200d3f4173099a6a40fc Mon Sep 17 00:00:00 2001 From: paramasi Date: Thu, 6 Oct 2022 09:17:28 +0200 Subject: [PATCH 33/49] Upgrade to gencodev39 for hg38 --- resources/configurationFiles/analysisSNVCallingGRCh38.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml index 1bb222f..47bb543 100644 --- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -48,10 +48,10 @@ - + - - + + Date: Mon, 7 Nov 2022 08:55:00 +0100 Subject: [PATCH 34/49] Bug fix with dbSNP counter --- resources/analysisTools/snvPipeline/in_dbSNPcounter.pl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl index 0c0b93d..c97155b 100755 --- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl +++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl @@ -80,7 +80,12 @@ $DBSBP = $col{"DBSNP"}; $KG = $col{"1K_GENOMES"}; $CONFIDENCE = $col{"CONFIDENCE"}; - $CLASSIFICATION = $col{"RECLASSIFICATION"}; + + if ($col{"ANNOTATION_control"} > 0) { + $CLASSIFICATION = $col{"ANNOTATION_control"}; + } else { + $CLASSIFICATION = $col{"RECLASSIFICATION"}; + } } } else{ From dc2ca36ee3223c965876aede00c18423461abdd7 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 08:55:44 +0100 Subject: [PATCH 35/49] Update WGS local control --- resources/configurationFiles/analysisSNVCallingGRCh38.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml index 47bb543..58c5503 100644 --- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -43,7 +43,7 @@ - + From 973ae9c57fa235fac12b9145f5f67a637ab22cc6 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 09:03:51 +0100 Subject: [PATCH 36/49] hg38: Add local control and gnomAD based confidence annotation --- .../snvPipeline/confidenceAnnotation_SNVs.py | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index d1089cf..5663dd2 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -83,6 +83,7 @@ def main(args): '##FILTER=\n' \ '##FILTER=\n' \ '##FILTER=\n' \ + '##FILTER=0.1%) or in local control database (>0.05%)">\n' \ '##SAMPLE=\n' \ '##SAMPLE=\n'\ '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' @@ -119,12 +120,12 @@ def main(args): variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", } - if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" - else: + if not args.no_control: fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ] header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers) @@ -201,13 +202,15 @@ def main(args): is_weird = False # coindicende with known artefact-producing regions if args.no_control: classification = "somatic" # start with default somatic + else: + # for potential re-classification (e.g. low coverage in control and in dbSNP => probably germline) + classification = help["ANNOTATION_control"] # start with original classification + + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: inGnomAD_WES = False inGnomAD_WGS = False inLocalControl_WES = False inLocalControl_WGS = False - else: - # for potential re-classification (e.g. low coverage in control and in dbSNP => probably germline) - classification = help["ANNOTATION_control"] # start with original classification ### For pancancer # genotype tumor as originally from mpileup @@ -244,26 +247,27 @@ def main(args): if "COMMON=1" in help["DBSNP_COL"]: is_commonSNP = True - if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: if indbSNP and is_commonSNP and not is_clinic: - reasons += "dbSNP(NoControl)" + #reasons += "dbSNP(NoControl)" + pass if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True infofield["gnomAD_Exomes"] = "gnomAD_Exomes" - reasons += "gnomAD_Exomes(NoControl)" + #reasons += "gnomAD_Exomes(NoControl)" if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): inGnomAD_WGS = True infofield["gnomAD_Genomes"] = "gnomAD_Genomes" - reasons += "gnomAD_Genomes(NoControl)" + #reasons += "gnomAD_Genomes(NoControl)" if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): inLocalControl_WGS = True infofield["LocalControl_WGS"] = "LocalControl_WGS" - reasons += "LocalControl_WGS(NoControl)" + #reasons += "LocalControl_WGS(NoControl)" if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): inLocalControl_WES = True infofield["LocalControl_WES"] = "LocalControl_WES" - reasons += "LocalControl_WES(NoControl)" + #reasons += "LocalControl_WES(NoControl)" # Punish for biases round 1 @@ -312,7 +316,7 @@ def main(args): filterfield["BI"] = 1 # Only for hg19 reference genome - if args.refgenome[0] == "hs37d5": + if args.refgenome[0] == "hs37d5" and not args.skipREMAP: # 2) annotations of regions that cause problems: some classes of repeats from RepeatMasker track, # segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability # simple repeats and low complexity (not the same as homopolymer, but similar enough); @@ -397,6 +401,17 @@ def main(args): filterfield["MAP"] = 1 reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) + # Nov 2022: If the variant is present in the gnomAD or local control WGS + # the confidence are reduced and thrown out. This is implemented for hg38 and could be + # used with skipREMAP option for hg19. + # inLocalControl_WES: Needs to be generated from a new hg38 dataset + filterfield["FREQ"] = 0 + if(args.refgenome[0] == 'GRCh38' or args.skipREMAP): + if(inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS): + reasons += 'commonSNP_or_technicalArtifact(-3)' + classification = "SNP_support_germline" + confidence -= 3 + filterfield["FREQ"] = 1 # if others have found the SNP already, it may be interesting despite low score # - but only if it's not a weird region. @@ -590,7 +605,9 @@ def main(args): reasons += "Germline_ALT<0.3(-2)" filterfield["FRC"] = 1 if in1KG or (indbSNP and not (is_precious or is_clinic)): # but this supports again - number of reads may be low! - classification += "_SNP_support" + if filterfield['FREQ'] == 0: + classification += "_SNP_support" + if depthC <= 10: # very probably germline due to skewed distribution at low coverage classification += "_lowCov" # => can end up as "germline_SNP_support_lowCov" @@ -684,7 +701,7 @@ def main(args): filters_line = [] if entries[6] == "" else entries[6].split(';') if args.pancanout is not None: filters_pancan = [] - for filter in ("RE","BL","DP","SB","TAC","dbSNP","DB","HSDEPTH","MAP","SBAF","FRQ","TAR","UNCLEAR","DPHIGH","DPLOWC","1PS","ALTC","ALTCFR","FRC","YALT","VAF","BI"): + for filter in ("RE","BL","DP","SB","TAC","dbSNP","DB","HSDEPTH","MAP","SBAF","FRQ","TAR","UNCLEAR","DPHIGH","DPLOWC","1PS","ALTC","ALTCFR","FRC","YALT","VAF","BI", "FREQ"): if filterfield.get(filter, 0) == 1: if args.pancanout is not None: filters_pancan.append(filter) @@ -801,6 +818,7 @@ def main(args): parser.add_argument("-x", "--runexome", dest="runexome", action="store_true", default=False, help="Run on exome, will turn off the high control coverage punishment " \ "and the PCR bias filter.") + parser.add_argument('--skipREMAP', dest='skipREMAP', action='store_true', default=False) parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", help="Max gnomAD WGS MAF", default=0.001, type=float) parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", help="Max gnomAD WES MAF", default=0.001, type=float) parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", help="Max local control WGS MAF", default=0.01, type=float) From 56aa726ec3e7d097b9e57edf32aa39d8b747ef80 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 13:48:35 +0100 Subject: [PATCH 37/49] Update README --- README.md | 11 +++++++++++ buildinfo.txt | 2 +- buildversion.txt | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 48141d3..5d28343 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,17 @@ The optional configuration JSON file defaults to the `convertToStdVCF.json` resi ## Changelog +* 3.0.0 + * Major + * Support for hg38/GRCh38 reference genome and variant calling from ALT and HLA contigs. + * Minor + * For hg38: Removing mappability and repeat elements' annotations from penalty calculations. + * skipREMAP: Option to remove repeat elements and mappability from confidence annotations in hg19. + * Removing EVS And ExAC AF from the annotations and no-control workflow filtering + * Support for variant calling from CRAM files + * Bug fix: Removing "quote" around the raw filter option `` + * Update `COWorkflowsBasePlugin` to `1.4.2` + * 2.2.0 * minor: Added `vcfConvertToStd.py`; slightly modified from code by @juleskers and Sophia Stahl diff --git a/buildinfo.txt b/buildinfo.txt index 7730d00..df81708 100755 --- a/buildinfo.txt +++ b/buildinfo.txt @@ -1,4 +1,4 @@ -dependson=COWorkflowsBasePlugin:1.3.0 +dependson=COWorkflowsBasePlugin:1.4.2 JDKVersion=1.8 GroovyVersion=2.4 RoddyAPIVersion=3.4 diff --git a/buildversion.txt b/buildversion.txt index 228b97b..fd8c949 100755 --- a/buildversion.txt +++ b/buildversion.txt @@ -1,2 +1,2 @@ -2.1 +3.0 0 From da45d2a91782ebca913879debfbeab43d4edc94e Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 13:50:19 +0100 Subject: [PATCH 38/49] Exempt classification with FREQ --- .../analysisTools/snvPipeline/confidenceAnnotation_SNVs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index 5663dd2..1d32592 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -409,7 +409,7 @@ def main(args): if(args.refgenome[0] == 'GRCh38' or args.skipREMAP): if(inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS): reasons += 'commonSNP_or_technicalArtifact(-3)' - classification = "SNP_support_germline" + #classification = "SNP_support_germline" confidence -= 3 filterfield["FREQ"] = 1 From 45fa141ae7800a7873a0b71be482350bae6550a6 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 Mar 2023 09:05:41 +0100 Subject: [PATCH 39/49] Reverting SNP based confidence scoring --- .../analysisTools/snvPipeline/confidenceAnnotation_SNVs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index 1d32592..1cf7c1a 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -408,9 +408,9 @@ def main(args): filterfield["FREQ"] = 0 if(args.refgenome[0] == 'GRCh38' or args.skipREMAP): if(inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS): - reasons += 'commonSNP_or_technicalArtifact(-3)' + #reasons += 'commonSNP_or_technicalArtifact(-3)' #classification = "SNP_support_germline" - confidence -= 3 + #confidence -= 3 filterfield["FREQ"] = 1 # if others have found the SNP already, it may be interesting despite low score From 5479faf3d69036c9c2aad9a03a26db1a116cca27 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 Mar 2023 09:06:54 +0100 Subject: [PATCH 40/49] Add exception for 'N' in createErrorPlots.py --- resources/analysisTools/snvPipeline/createErrorPlots.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/analysisTools/snvPipeline/createErrorPlots.py b/resources/analysisTools/snvPipeline/createErrorPlots.py index dc7dbfc..902eed2 100755 --- a/resources/analysisTools/snvPipeline/createErrorPlots.py +++ b/resources/analysisTools/snvPipeline/createErrorPlots.py @@ -199,6 +199,10 @@ def calculateErrorMatrix(vcfFilename, referenceFilename, errorType): # 23.05.2016 JB: Excluded multiallelic SNVs if ',' in split_line[header.index("ALT")]: continue + # 21.02.2023 NP: Excluded SNVs with 'N' before or after "," in context + if 'N,' in split_line[header.index("SEQUENCE_CONTEXT")] or ',N' in split_line[header.index("SEQUENCE_CONTEXT")]: + continue + chrom = split_line[header.index("CHROM")] pos = int(split_line[header.index("POS")]) context = "" From f633856df579de6627a47686c47cbb7c906574cb Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 Mar 2023 09:08:20 +0100 Subject: [PATCH 41/49] Remove NA values in quantile calculation --- .../snvPipeline/tripletBased_BQDistribution_plotter.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R b/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R index d57fcb5..07b6bff 100755 --- a/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R +++ b/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R @@ -474,7 +474,7 @@ plot_BQD_to_pdf = function(PDF_OUTPUT_FILE, data.bq.triplet, molten.alt$nBases=transitionSubset[molten.alt$sample,"nBQ.alt"] molten.alt[,ReadPositionsQuantile_ColName] = sapply(transitionSubset[molten.alt$sample,"RP_string.alt"], function(positionsString) { positions = as.integer(unlist(strsplit(positionsString, ","))) - return (round(quantile(positions, ReadPositionQuantile))) + return (round(quantile(positions, ReadPositionQuantile, na.rm = TRUE))) }) # molten.altReadPos = melt(baseScores.alt.counts.normalized.cumul, id.vars = "sample") if (whatToPlot == "BQD_sampleIndividual_ColoredByChromosome") { From ea7154be10a87a614be904f4f9fcfecaf7de91f0 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 Mar 2023 09:08:49 +0100 Subject: [PATCH 42/49] Update reference --- .../configurationFiles/analysisSNVCallingGRCh38.xml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml index 58c5503..4cdb729 100644 --- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -13,15 +13,15 @@ - - - - + + + + - - + + From 3f8419cc3f5db1c1ecc20653c9d18820f9af467a Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 Mar 2023 09:10:42 +0100 Subject: [PATCH 43/49] Update raw_filter_punishment in accordance with RAW_SNV_FILTER_OPTIONS --- .../analysisTools/snvPipeline/confidenceAnnotation_SNVs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py index 1cf7c1a..5c1c5a1 100755 --- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py +++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py @@ -653,7 +653,7 @@ def main(args): #To make sure that changes in the raw filter do not influence the final result we punish them with -3 # TODO: JB: Why we use the second decimal of orignal value here? - if not args.runlowmaf and ((float(entries[5]) < 3 and (float("%.2f"%fr_var_tum) < 0.05 or (tvf + tvr < 5))) or (float(entries[5]) < 20 and ((tvf == 0 or tvr == 0) and (tvf + tvr <= 3)))): + if not args.runlowmaf and ((float(entries[5]) < 3 and (float("%.2f"%fr_var_tum) < 0.03 or (tvf + tvr < 3))) or (float(entries[5]) < 20 and ((tvf == 0 or tvr == 0) and (tvf + tvr <= 3)))): confidence -= 3 reasons += "raw_filter_punishment(-3)" filterfield["FRQ"] = 1 From 13bdc5f5957299b30c84a54d775247d8afcf5386 Mon Sep 17 00:00:00 2001 From: paramasi Date: Thu, 23 Mar 2023 09:07:48 +0100 Subject: [PATCH 44/49] Move python env --- .../analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh index e1aeb9d..991202f 100755 --- a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh +++ b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh @@ -19,7 +19,7 @@ module load bedtools/"${BEDTOOLS_VERSION:?BEDTOOLS_VERSION undefined}" module load pypy/"${PYPY_VERSION:?PYPY_VERSION undefined}" set +ue -source /dkfz/cluster/virtualenvs/paramasi/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate +source /omics/groups/OE0246/shared/paramasi/compute_env/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate set -ue export BGZIP_BINARY=bgzip From e2f7db944ebc79a4f49050ce396c373fd55fc5f4 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 12 Jun 2023 12:21:29 +0200 Subject: [PATCH 45/49] Update virtual env path --- .../dkfz_lsf_virtual_snv_env_packages.txt | 19 +++++++++++++++++++ .../environments/tbi-lsf-cluster.sh | 2 +- .../configurationFiles/analysisSNVCalling.xml | 2 ++ 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt diff --git a/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt b/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt new file mode 100644 index 0000000..2399198 --- /dev/null +++ b/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt @@ -0,0 +1,19 @@ +Package Version +--------------- -------- +biopython 1.57 +Cython 0.27.3 +funcsigs 1.0.2 +matplotlib 1.4.3 +mock 2.0.0 +nose 1.3.7 +numpy 1.11.3 +pbr 3.1.1 +pip 20.3.4 +pyparsing 2.2.0 +pysam 0.16.0.1 +python-dateutil 2.6.1 +pytz 2017.3 +scipy 0.12.0 +setuptools 36.8.0 +six 1.11.0 +wheel 0.30.0 diff --git a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh index 991202f..1584d10 100755 --- a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh +++ b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh @@ -19,7 +19,7 @@ module load bedtools/"${BEDTOOLS_VERSION:?BEDTOOLS_VERSION undefined}" module load pypy/"${PYPY_VERSION:?PYPY_VERSION undefined}" set +ue -source /omics/groups/OE0246/shared/paramasi/compute_env/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate +source "${DKFZ_LSF_VIRTUAL_SNV_ENV} set -ue export BGZIP_BINARY=bgzip diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index 620a8ad..99c8019 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -166,6 +166,8 @@ + + From 44614b6a3dbffe9f4b798a7340ee6c73e0f1d367 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 12 Jun 2023 12:22:34 +0200 Subject: [PATCH 46/49] XML comments to description --- .../snvPipeline/PurityReloaded.py | 2 +- .../snvPipeline/createErrorPlots.py | 2 +- .../analysisTools/snvPipeline/filter_vcf.sh | 2 +- .../analysisSNVCallingGRCh38.xml | 26 +++++++------------ 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/resources/analysisTools/snvPipeline/PurityReloaded.py b/resources/analysisTools/snvPipeline/PurityReloaded.py index b8dcb7d..f67c907 100755 --- a/resources/analysisTools/snvPipeline/PurityReloaded.py +++ b/resources/analysisTools/snvPipeline/PurityReloaded.py @@ -251,7 +251,7 @@ def parseVcf(file,num): while (l!= ""): t=l.split('\t') if (t[0][0] != "#") and isValid(t): - # Skiping the non-primary assembly variants from purity calculations + # Skipping the non-primary assembly variants from purity calculations if t[0].startswith('HLA') or t[0].endswith('_alt'): l=vcf.readline() continue diff --git a/resources/analysisTools/snvPipeline/createErrorPlots.py b/resources/analysisTools/snvPipeline/createErrorPlots.py index 902eed2..a528de6 100755 --- a/resources/analysisTools/snvPipeline/createErrorPlots.py +++ b/resources/analysisTools/snvPipeline/createErrorPlots.py @@ -200,7 +200,7 @@ def calculateErrorMatrix(vcfFilename, referenceFilename, errorType): if ',' in split_line[header.index("ALT")]: continue # 21.02.2023 NP: Excluded SNVs with 'N' before or after "," in context - if 'N,' in split_line[header.index("SEQUENCE_CONTEXT")] or ',N' in split_line[header.index("SEQUENCE_CONTEXT")]: + if {'N,', ',N'}.intersection(split_line[header.index("SEQUENCE_CONTEXT")]): continue chrom = split_line[header.index("CHROM")] diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh index 2ae1534..fa23694 100755 --- a/resources/analysisTools/snvPipeline/filter_vcf.sh +++ b/resources/analysisTools/snvPipeline/filter_vcf.sh @@ -24,7 +24,7 @@ getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFI ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}` # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail... -declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES_PLOTTING}" +declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES_PLOTTING:-${CHROMOSOME_INDICES}}" numberOfChromosomes=${CHROMOSOME_INDICES[@]} outputFilenamePrefix=${mpileupDirectory}/${SNVFILE_PREFIX}${PID} outputFilenamePrefix_original=${outputFilenamePrefix} diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml index 4cdb729..96d2dcf 100644 --- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml +++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml @@ -9,31 +9,26 @@ - - + - - + - - - - + + + - - - - + + + - - - + + @@ -53,7 +48,6 @@ - From 3a47349b601ab9b51f5f821fd4e7829e032b86f7 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 12 Jun 2023 12:55:00 +0200 Subject: [PATCH 47/49] Minor update --- README.md | 2 +- .../dkfz_lsf_virtual_snv_env_packages.txt | 19 ------------------- .../configurationFiles/analysisSNVCalling.xml | 2 +- 3 files changed, 2 insertions(+), 21 deletions(-) delete mode 100644 resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt diff --git a/README.md b/README.md index 0fba5fa..a1b5301 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # DKFZ SNVCalling Workflow -An SNV calling workflow developed in the Applied Bioinformatics and Theoretical Bioinformatics groups at the DKFZ. An earlier version (pre Github) of this workflow was used in the [Pancancer](https://dockstore.org/containers/quay.io/pancancer/pcawg-dkfz-workflow) project. The workflow is only suited for human data (hg37/hg19; some later versions also hg38), because of the big role annotations play in this workflow. +An SNV calling workflow developed in the Applied Bioinformatics and Theoretical Bioinformatics groups at the DKFZ. An earlier version (pre Github) of this workflow was used in the [Pancancer](https://dockstore.org/containers/quay.io/pancancer/pcawg-dkfz-workflow) project. The workflow is only suited for human data (hg37/GRCh37/hg19; and also for hg38/GRCh38), because of the big role annotations play in this workflow. >
de.NBI logoYour opinion matters! The development of this workflow is supported by the German Network for Bioinformatic Infrastructure (de.NBI). By completing this very short (30-60 seconds) survey you support our efforts to improve this tool.
diff --git a/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt b/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt deleted file mode 100644 index 2399198..0000000 --- a/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt +++ /dev/null @@ -1,19 +0,0 @@ -Package Version ---------------- -------- -biopython 1.57 -Cython 0.27.3 -funcsigs 1.0.2 -matplotlib 1.4.3 -mock 2.0.0 -nose 1.3.7 -numpy 1.11.3 -pbr 3.1.1 -pip 20.3.4 -pyparsing 2.2.0 -pysam 0.16.0.1 -python-dateutil 2.6.1 -pytz 2017.3 -scipy 0.12.0 -setuptools 36.8.0 -six 1.11.0 -wheel 0.30.0 diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml index e620f49..c337644 100755 --- a/resources/configurationFiles/analysisSNVCalling.xml +++ b/resources/configurationFiles/analysisSNVCalling.xml @@ -11,7 +11,7 @@ - From b8d46649c39587b005d6b76bda3bd061ea4e4f89 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 12 Jun 2023 13:50:29 +0200 Subject: [PATCH 48/49] Update readme with hg38 calls --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index a1b5301..a94901d 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,8 @@ Please have a look at the `resources/configurationFiles/analysisSNVCalling.xml` ## Example Call +For hg19, + ```bash roddy.sh run projectConfigurationName@analysisName patientId \ --useconfig=/path/to/your/applicationProperties.ini \ @@ -123,6 +125,22 @@ roddy.sh run projectConfigurationName@analysisName patientId \ --cvalues="bamfile_list:/path/to/your/control.bam;/path/to/your/tumor.bam,sample_list:normal;tumor,possibleTumorSampleNamePrefixes:tumor,possibleControlSampleNamePrefixes:normal,REFERENCE_GENOME:/reference/data/hs37d5_PhiX.fa,CHROMOSOME_LENGTH_FILE:/reference/data/hs37d5_PhiX.chromSizes,extractSamplesFromOutputFiles:false" ``` +For hg38, add the following workflow configs to the `projectConfigs` + +```xml + + + +``` + +```bash +roddy.sh run projectConfigurationName@analysisName patientId \ + --useconfig=/path/to/your/applicationProperties.ini \ + --configurationDirectories=/path/to/your/projectConfigs \ + --useiodir=/input/directory,/output/directory/snv \ + --cvalues="bamfile_list:/path/to/your/control.bam;/path/to/your/tumor.bam,sample_list:normal;tumor,possibleTumorSampleNamePrefixes:tumor,possibleControlSampleNamePrefixes:normal,REFERENCE_GENOME:/reference/data/GRCh38_decoy_ALT_HLA_PhiX.fa,CHROMOSOME_LENGTH_FILE:/reference/data/GRCh38_decoy_ALT_HLA_PhiX.chromSizes,extractSamplesFromOutputFiles:false" +``` + ### No Control * Set the parameter `isNoControlWorkflow` to `true`. From 4dca6367a9dfa87ec9d1f464037d353b93b1d760 Mon Sep 17 00:00:00 2001 From: paramasi Date: Tue, 27 Jun 2023 16:33:54 +0200 Subject: [PATCH 49/49] Fixing the missed conflict --- .../snvPipeline/filter_PEoverlap.py | 476 ------------------ 1 file changed, 476 deletions(-) diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py index 14e016a..6d19fcf 100755 --- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py +++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py @@ -1,4 +1,3 @@ -<<<<<<< HEAD #!/usr/bin/env python # # Copyright (c) 2018 German Cancer Research Center (DKFZ). @@ -477,478 +476,3 @@ def performAnalysis(args): performAnalysis(args) #print "\nProgram successfully terminating...." -======= -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the MIT License (https://opensource.org/licenses/MIT). -# - - -# python /home/jaegern/pyWorkspace/NGS_Read_Processing/src/filter_PEoverlap.py --inf=snvs_108031.vcf --alignmentFile=/icgc/lsdf/mb/analysis/medullo/adultMB/results_per_pid/108031/alignment/tumor_108031_merged.bam.rmdup.bam --outf=snvs_108031_PEoverlapFiltered.vcf -# more snvs_108031.vcf | python /home/jaegern/pyWorkspace/NGS_Read_Processing/src/filter_PEoverlap.py --alignmentFile=/icgc/lsdf/mb/analysis/medullo/adultMB/results_per_pid/108031/alignment/tumor_108031_merged.bam.rmdup.bam --outf=snvs_108031_PEoverlapFiltered_nonALT_FINAL.vcf - - -import platform -if platform.python_implementation() == "PyPy": - import copysam as pysam -else: # "CPython" - import pysam - -import sys, os -from vcfparser import * -import re - -def listToTabsep(listItems, sep='\t'): - return sep.join(listItems) - -def qualFromASCII(ch): - return(ord(ch) - qualScoreOffset) - - -def transformQualStr(s): - return map(qualFromASCII,s) - -# Routine for getting index of ACGTNacgtn lists -def getIndexACGTNacgtn(is_reverse, is_read1, base): - if (is_reverse): - if(is_read1): - if(base == "a"): - return ["minus", 5] - elif(base == "c"): - return ["minus", 6] - elif(base == "g"): - return ["minus", 7] - elif(base == "t"): - return ["minus", 8] - elif(base == "n"): - return ["minus", 9] - else: - if(base == "a"): - return ["plus", 5] - elif(base == "c"): - return ["plus", 6] - elif(base == "g"): - return ["plus", 7] - elif(base == "t"): - return ["plus", 8] - elif(base == "n"): - return ["plus", 9] - else: - if(is_read1): - if(base == "a"): - return ["plus", 0] - elif(base == "c"): - return ["plus", 1] - elif(base == "g"): - return ["plus", 2] - elif(base == "t"): - return ["plus", 3] - elif(base == "n"): - return ["plus", 4] - else: - if(base == "a"): - return ["minus", 0] - elif(base == "c"): - return ["minus", 1] - elif(base == "g"): - return ["minus", 2] - elif(base == "t"): - return ["minus", 3] - elif(base == "n"): - return ["minus", 4] - -def decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar): - if remove_base.lower() == REF.lower(): - if remove_is_reverse: - # check if none of the 4 DP4 values are < 0 now, which can happen due to BAQ values instead of original base qualities, which are not part of the BAM file - if DP4rr > 0: DP4rr -= 1 - else: - if DP4rf > 0: DP4rf -= 1 - elif remove_base.lower() == ALT.lower(): - if remove_is_reverse: - if DP4ar > 0: DP4ar -= 1 - else: - if DP4af > 0: DP4af -= 1 - return(DP4rf, DP4rr, DP4af, DP4ar) - -class BoolCounter: - """ A class for counter objects """ - - def __init__(self, ref_base, alt_base): - self.ref_base = ref_base - self.alt_base = alt_base - self.ref_count = 0 - self.alt_count = 0 - - def update(self, current_base): - if self.ref_base == current_base: - self.ref_count += 1 - elif self.alt_base == current_base: - self.alt_count += 1 - - def update_nonREFnonALT(self, count): - self.alt_count =+ count - - def counted(self): - return [self.ref_count, self.alt_count] - -# MAIN ANALYSIS PROCEDURE -def performAnalysis(args): - global qualScoreOffset - - # http://stackoverflow.com/questions/881696/unbuffered-stdout-in-python-as-in-python-u-from-within-the-program - unbuffered = os.fdopen(sys.stdout.fileno(), 'w', 0) - sys.stdout = unbuffered - - if args.qualityScore == 'illumina': qualScoreOffset = 64 - elif args.qualityScore == 'phred': qualScoreOffset = 33 - - #vcfInFile = open(args.inf, "r") - #outFile = open(args.outf, "w") - - # Reference file for BAQ_recalcuation and local realignment - reference_file = pysam.Fastafile(args.refFileName) - - mode = "r" - multiple_iterators = False - # Setting pysam read mode based on the file extension - if args.alignmentFile.split(".")[-1] == "bam": - mode += "b" - elif args.alignmentFile.split(".")[-1] == "cram": - mode += "c" - samfile = pysam.Samfile(args.alignmentFile, mode) # This should work for BAM file only (with random access). - - if args.altPosF != '': - ALT_basePositions_file = args.altPosF - - if args.refPosF != '': - REF_basePositions_file = args.refPosF - - if args.altBQF != '': - ALT_baseQualities_file = args.altBQF - - if args.refBQF != '': - REF_baseQualities_file = args.refBQF - - - for line in sys.stdin: # vcfInFile - if line[0] == "#": - headers = list(line[1:].rstrip().split('\t')) - fixed_headers = ["^CHROM$", "^POS$", "^REF$", "^ALT$", "^INFO$", ] - if args.no_control: - fixed_headers += ["^CONFIDENCE$", "^RECLASSIFICATION$", ] - else: - fixed_headers += ["^ANNOTATION_control", ] - header_indices = get_header_indices(headers, args.configfile, fixed_headers) - sys.stdout.write(line) - continue # skip header from analysis - - entries = line.strip().split('\t') - parsed_line = LineParser(entries, header_indices) - - nonREFnonALTfwd=0 - nonREFnonALTrev=0 - ALTcount=0 - - ALT_basePositions=[] - REF_basePositions=[] - REF_baseQualities=[] - ALT_baseQualities=[] - - # how to treat multiallelic SNVs? Skipped in this current version... - if ((args.no_control and int(parsed_line["CONFIDENCE"]) > 7 and "somatic" in parsed_line["RECLASSIFICATION"]) or (not args.no_control and "somatic" in parsed_line["ANNOTATION_control"])) and len(parsed_line["ALT"]) == 1: - # DP=13;AF1=0.5;AC1=1;DP4=2,3,3,4;MQ=37;FQ=75;PV4=1,1,1,1 - info_values = parsed_line["INFO"].split(';') - for info_idx, info_value in enumerate(info_values): - if info_value[:4] == "DP4=": - DP4_idx = info_idx - DP4 = map(int, info_value[4:].split(',')) - DP4rf, DP4rr, DP4af, DP4ar = DP4 - DP4_original = re.sub('DP4', 'DP4original', info_value) # Keeping a backup of original DP4 - DP4_original_alt = DP4af + DP4ar - break - - chrom=parsed_line["CHROM"] - pos=int(parsed_line["POS"]) - REF=parsed_line["REF"] - ALT=parsed_line["ALT"] - - readNameHash={} - readMateHash={} # Hash to store read and mate starting positions for duplicate marking - readMateHash_qnameLocation={} # Hash to store the location of gname in the above hash list - - - ACGTNacgtn1 = [0]*10 - ACGTNacgtn2 = [0]*10 - count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection - count_supple = BoolCounter(REF, ALT) # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions - count_mismatch = BoolCounter(REF, ALT) # " for mismatch report - count_nonREFnonALT = BoolCounter(REF, ALT) # " to count the non-ref and non-alt base at POS - - # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq). - for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators): - if pileupcolumn.pos == (pos-1): - #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments) - for pileupread in pileupcolumn.pileups: - if pileupread.is_del: - # 31 May 2016 JB: deletion at the pileup position - continue - baseScore = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] - readpos = pileupread.query_position - if pileupread.alignment.seq[pileupread.query_position].lower() == ALT.lower(): - if args.altBQF != '': - ALT_baseQualities.append(baseScore) - if args.altPosF != '': - if pileupread.alignment.is_reverse: - readlength = len(pileupread.alignment.seq) - readpos = (readlength - readpos) - ALT_basePositions.append(readpos) - if pileupread.alignment.seq[pileupread.query_position].lower() == REF.lower(): - if args.refBQF != '': - REF_baseQualities.append(baseScore) - if args.refPosF != '': - if pileupread.alignment.is_reverse: - readlength = len(pileupread.alignment.seq) - readpos = (readlength - readpos) - REF_basePositions.append(readpos) - - if pileupread.alignment.mapq >= args.mapq: - # http://wwwfgu.anat.ox.ac.uk/~andreas/documentation/samtools/api.html USE qqual - try: - if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: - # check if we consider this read as a proper read in terms of number of mismatches - if args.allowedNumberOfMismatches > -1: - numberOfMismatches = None - for tag in pileupread.alignment.tags: - if tag[0] == "NM": - numberOfMismatches = tag[1] - break - else: - continue - - if numberOfMismatches is not None: - if numberOfMismatches > args.allowedNumberOfMismatches: - remove_base = pileupread.alignment.seq[pileupread.query_position] - remove_is_reverse = pileupread.alignment.is_reverse - count_mismatch.update(remove_base) - (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - # after decreasing the respective DP4 value, go directly to the next read - # without remembering the current read - # This will lead to an unknown read name when the paired read occurs at the same - # position. As we have already discarded the current high-mismatch read, we do not - # have to decrease DP4 values again, when the read partner occurs at the same SNV. - # We also do not increase ANCGTNacgtn for the discarded read. - continue - - # Check if pileupread.alignment is proper pair - if(pileupread.alignment.is_proper_pair): - # count to ACGTNacgtn list - is_reverse = pileupread.alignment.is_reverse - is_read1 = pileupread.alignment.is_read1 - base = pileupread.alignment.seq[pileupread.query_position].lower() - ACGTNacgtn_index = getIndexACGTNacgtn(is_reverse, is_read1, base) - if(ACGTNacgtn_index[0] == "plus"): - ACGTNacgtn1[ACGTNacgtn_index[1]] += 1 - else: - ACGTNacgtn2[ACGTNacgtn_index[1]] += 1 - - #if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: # DEBUG July 23 2012: BROAD BAM problem due to pileupread.alignment.qqual being shorter sometimes than pileupread.alignment.qual - if(pileupread.alignment.query_name in readNameHash): - #print pileupread.alignment.query_name - old_qual = readNameHash[pileupread.alignment.query_name][0] - old_base = readNameHash[pileupread.alignment.query_name][1] - old_is_reverse = readNameHash[pileupread.alignment.query_name][2] - old_read_mate_tuple = readNameHash[pileupread.alignment.query_name][3] - current_qual = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] - current_base = pileupread.alignment.seq[pileupread.query_position] - current_is_reverse = pileupread.alignment.is_reverse - current_read_mate_tuple = (pileupread.alignment.reference_id, pileupread.alignment.reference_start, pileupread.alignment.reference_end, pileupread.alignment.next_reference_id, pileupread.alignment.next_reference_start) - # if read name occurs twice for one variant, then due to overlapping PE reads, then subtract variant count from DP4 field - # if old_base is not equal to new_base remove the one with the smaller base quality - remove_base = None - remove_is_reverse = None - if(not(old_base == current_base)): - if(old_qual <= current_qual): - remove_base = old_base - remove_is_reverse = old_is_reverse - remove_old = True - else: - remove_base = current_base - remove_is_reverse = current_is_reverse - remove_old = False - else: - remove_base = current_base - remove_is_reverse = current_is_reverse - remove_old = False - - count_PE.update(remove_base) - (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - # If current base is better, then removing the information about old mate - # If current base is not good, then do nothing - if remove_old: - old_location = readMateHash_qnameLocation[pileupread.alignment.query_name] - del readMateHash[old_read_mate_tuple][old_location] - read_mate_tuple_value = (current_qual, (current_base, current_is_reverse, pileupread.alignment.query_name, pileupread.alignment.is_supplementary)) - if current_read_mate_tuple in readMateHash: - readMateHash[current_read_mate_tuple].append(read_mate_tuple_value) - else: - readMateHash[current_read_mate_tuple] = [] - readMateHash[current_read_mate_tuple].append(read_mate_tuple_value) - - else: - # Store base quality, base, and read direction in readNameHash - base_qual_score=transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] - read_mate_tuple = (pileupread.alignment.reference_id, pileupread.alignment.reference_start, pileupread.alignment.reference_end, pileupread.alignment.next_reference_id, pileupread.alignment.next_reference_start) - read_mate_tuple_value = (base_qual_score, (pileupread.alignment.seq[pileupread.query_position], pileupread.alignment.is_reverse, pileupread.alignment.query_name, pileupread.alignment.is_supplementary)) - - readNameHash[pileupread.alignment.query_name] = [base_qual_score, pileupread.alignment.seq[pileupread.query_position], pileupread.alignment.is_reverse, read_mate_tuple] - - if read_mate_tuple in readMateHash: - readMateHash[read_mate_tuple].append(read_mate_tuple_value) - else: - readMateHash[read_mate_tuple] = [] - readMateHash[read_mate_tuple].append(read_mate_tuple_value) - - readMateHash_qnameLocation[pileupread.alignment.query_name] = len(readMateHash[read_mate_tuple]) - 1 # Location of the last pushed element in the array - - except IndexError: - "soft-clipped or trimmed base, not part of the high-qual alignemnt anyways, skip" - - if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: - - if pileupread.alignment.seq[pileupread.query_position] == ALT: - ALTcount += 1 - # samtools mpileup sometimes counts bases as variants which are neither REF nor ALT - if (pileupread.alignment.seq[pileupread.query_position] != REF) and (pileupread.alignment.seq[pileupread.query_position] != ALT): - if pileupread.alignment.is_reverse: - nonREFnonALTrev += 1 - #if DP4ar > 0: DP4ar -= 1 - else: - nonREFnonALTfwd += 1 - #if DP4af > 0: DP4af -= 1 - - if (len(REF_baseQualities) > 0): - VAF = 1.0 * len(ALT_baseQualities) / (len(REF_baseQualities) + len(ALT_baseQualities)) - elif (len(ALT_baseQualities) > 0): - VAF = 1.0 - else: - VAF = 0.0 - - if args.altBQF != '': - scoreString = ",".join([str(score) for score in ALT_baseQualities]) - if scoreString != '': - scoreString = ";".join([scoreString , str(VAF)]) - ALT_baseQualities_file.write("%s\t%s\t%s\n" % (chrom, pos, scoreString)) - - if args.refBQF != '': - scoreString = ",".join([str(score) for score in REF_baseQualities]) - if scoreString != '': - scoreString = ";".join([scoreString , str(VAF)]) - REF_baseQualities_file.write("%s\t%s\t%s\n" % (chrom, pos, scoreString)) - - if args.altPosF != '': - positionsString = ",".join([str(readpos) for readpos in ALT_basePositions]) - if positionsString != '': - ALT_basePositions_file.write("%s\t%s\t%s\n" % (chrom, pos, positionsString)) - - if args.refPosF != '': - positionsString = ",".join([str(readpos) for readpos in REF_basePositions]) - if positionsString != '': - REF_basePositions_file.write("%s\t%s\t%s\n" % (chrom, pos, positionsString)) - - break # only one pileup for a position - - # Calculating duplicates based on read-mate pair's start positions (chr id and start location) - count_duplicate = BoolCounter(REF, ALT) - - for key in readMateHash: - value_length = len(readMateHash[key]) - if value_length > 0: - sorted_values = sorted(readMateHash[key], key=lambda x: x[0]) # Sorted based on base quality - sorted_values = sorted_values[:-1] # removing the read with highest quality, so it will be retained for count - for value in sorted_values: # Removing everthing else - qual_value, decreaseInfo = value - remove_base = decreaseInfo[0] - remove_is_reverse = decreaseInfo[1] - - count_duplicate.update(remove_base) - (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - - if (DP4[2] + DP4[3]) > ALTcount: # that the ALTcount is larger happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling - if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd - - if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev - - nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd - count_nonREFnonALT.update_nonREFnonALT(nonREFnonALT) - - #Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)" - supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted())) - supple_dup_str += ';DP2dup=' + ','.join(map(str, count_duplicate.counted())) - supple_dup_str += ';DP2pairEnd=' + ','.join(map(str, count_PE.counted())) - supple_dup_str += ';DP2mis=' + ','.join(map(str, count_mismatch.counted())) - supple_dup_str += ';DP2nonREFnonALT=' + ','.join(map(str, count_nonREFnonALT.counted())) - - ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1]) - ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2]) - - info_values[DP4_idx] = "DP4=" + str(DP4rf)+ "," + str(DP4rr)+ "," + str(DP4af)+ "," + str(DP4ar) - info_values.append(ACGTNacgtn1_string) - info_values.append(ACGTNacgtn2_string) - info_values.append(DP4_original) - info_values.append(supple_dup_str) - - entries[header_indices["INFO"]] = ';'.join(info_values) - - sys.stdout.write('\t'.join(entries) + '\n') - else: - sys.stdout.write(line) # write germline and somatic-multiallelic SNVs as is - - del ALT_basePositions - del REF_basePositions - del REF_baseQualities - del ALT_baseQualities - samfile.close() - - if args.altBQF is not None: - ALT_baseQualities_file.close() - - if args.refBQF is not None: - REF_baseQualities_file.close() - - if args.altPosF is not None: - ALT_basePositions_file.close() - - if args.refPosF is not None: - REF_basePositions_file.close() - - #vcfInFile.close() - #outFile.close() - -if __name__ == '__main__': - #print "Starting program...\n" - import argparse - parser = argparse.ArgumentParser() - #parser.add_option('--inf',action='store',type='string',dest='inf',help='Specify the name of the input vcf file containing all snvs (germline and somatic)',default='') - parser.add_argument('--alignmentFile',dest='alignmentFile',help='Specify the name of the BAM file containing bwa alignments, has to be the BAM file that was used to call the variants in the input vcf file - REQUIRED', required=True) - #parser.add_option('--outf',action='store',type='string',dest='outf',help='Specify the name of the output file, which will have same format as input vcf but with PE overlap filtered DP4 values if somatic and if snvs in PE overlap region',default='') - parser.add_argument('--referenceFile', dest='refFileName', help='Specify the full path of reference genome file in fasta format, with index in the same directory') - parser.add_argument('--mapq',type=int,dest='mapq',help='Specify the minimum mapping quality of bwa used for mpileup as parameter -q (default: 30 )',default=30) - parser.add_argument('--baseq',type=int,dest='baseq',help='Specify the minimum base quality scores used for mpileup as parameter -Q (default: 13)',default=13) - parser.add_argument('--qualityScore',dest='qualityScore',help='Specify whether the per base quality score is given in phred or illumina format (default is Illumina score: ASCII offset of 64, while PHRED scores have an ASCII offset of 33)',default='phred') - parser.add_argument('--maxNumberOfMismatchesInRead',type=int,dest='allowedNumberOfMismatches',help='Specify the number of mismatches that are allowed per read in order to consider this read. Value of -1 (default) turns this filter off.',default=-1) - parser.add_argument('--altBaseQualFile',nargs="?",type=argparse.FileType('w'),dest='altBQF',help='Specify the name of the output file for alternative allele base qualities.',default=None) - parser.add_argument('--refBaseQualFile',nargs="?",type=argparse.FileType('w'),dest='refBQF',help='Specify the name of the output file for reference allele base qualities.',default=None) - parser.add_argument('--altBasePositionsFile',nargs="?",type=argparse.FileType('w'),dest='altPosF',help='Specify the name of the output file for position within the reads for alternative bases.',default=None) - parser.add_argument('--refBasePositionsFile',nargs="?",type=argparse.FileType('w'),dest='refPosF',help='Specify the name of the output file for position within the reads for reference bases.',default=None) - parser.add_argument('--nocontrol',action='store_true',dest='no_control',help='Specify the workflow is run without control.',default=False) - parser.add_argument('--configFile',nargs="?",type=argparse.FileType('r'),dest='configfile',help='Specify the config file which contains header names.',default=None) - - - args = parser.parse_args() - performAnalysis(args) - #print "\nProgram successfully terminating...." - ->>>>>>> master