From ba8b0f5e5ed316f52205ac03cfb12a4a0541f1a0 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Mon, 19 Aug 2019 09:38:47 +0200
Subject: [PATCH 01/49] Removing ExAC and EVS from annotation and no-control
 filtering

---
 .../snvPipeline/confidenceAnnotation_SNVs.py      | 15 +--------------
 .../analysisTools/snvPipeline/snvAnnotation.sh    |  4 +---
 .../configurationFiles/analysisSNVCalling.xml     |  8 --------
 3 files changed, 2 insertions(+), 25 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index f13d346..df0e0fa 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -105,8 +105,6 @@ def main(args):
             variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", }
 
             if args.no_control:
-                variable_headers["ExAC_COL"] = "^ExAC$"
-                variable_headers["EVS_COL"] = "^EVS$"
                 variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$"
                 variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$"
                 variable_headers["LOCALCONTROL_COL"] = "^LocalControlAF$"
@@ -187,8 +185,6 @@ def main(args):
         is_weird = False # coindicende with known artefact-producing regions
         if args.no_control:
             classification = "somatic" # start with default somatic
-            inExAC = False
-            inEVS = False
             inGnomAD_WES = False
             inGnomAD_WGS = False
             inLocalControl = False
@@ -234,15 +230,6 @@ def main(args):
         if args.no_control:
             if indbSNP and is_commonSNP and not is_clinic:
                 reasons += "dbSNP(NoControl)"
-            if help["ExAC_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["ExAC_COL"], "AF").split(','))):
-                inExAC = True
-                infofield["ExAC"] = "ExAC"
-                reasons += "ExAC(NoControl)"
-            if help["EVS_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["EVS_COL"], "MAF").split(','))):
-                inEVS = True
-                infofield["EVS"] = "EVS"
-                reasons += "EVS(NoControl)"
-
             if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))):
                 inGnomAD_WES = True
                 infofield["gnomAD_Exomes"] = "gnomAD_Exomes"
@@ -397,7 +384,7 @@ def main(args):
 
         if (args.no_control):
             # an SNV that is in dbSNP but not "clinic" or/and in 1 KG with high frequency is probably germline
-            if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inExAC or inEVS or inGnomAD_WES or inGnomAD_WGS or inLocalControl):
+            if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl):
                classification = "SNP_support_germline"
 
         # 3) information from the calls and germline comparisons: coverage, strand bias, variant support, ..
diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh
index 7d62a8e..d3c97f5 100755
--- a/resources/analysisTools/snvPipeline/snvAnnotation.sh
+++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh
@@ -90,9 +90,7 @@ if [[ ${isNoControlWorkflow-false} == "false" ]]; then
 fi
 
 cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${DBSNP} --columnName=${DBSNP_COL} --reportMatchType  --bAdditionalColumn=2 --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL}  --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${ExAC} --columnName ${ExAC_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${EVS} --columnName ${EVS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL}  --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \    
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL} --columnName ${LOCALCONTROL_COL} --bFileType vcf --reportMatchType --reportLevel 4 "
diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 30a9dc3..e55d168 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -47,10 +47,6 @@
         <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
         <cvalue name="KGENOME" value="${hg19DatabasesDirectory}/1000genomes/ALL.wgs.phase1_integrated_calls.20101123.snps_chr.vcf.gz" type="path"/>
         <cvalue name="KGENOMES_COL" value="1K_GENOMES" type="string"/>
-        <cvalue name="ExAC" value="/icgc/dkfzlsdf/analysis/UniExome/GermlineAnnotation/annotationInfo/ExAC/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" type="path" />
-        <cvalue name="ExAC_COL" value="ExAC" type="string"/>
-        <cvalue name="EVS" value="${hg19DatabasesDirectory}/EVS/ESP6500SI-V2-SSA137.updatedProteinHgvs.ALL.snps_indels.vcf.gz" type="path" />
-        <cvalue name="EVS_COL" value="EVS" type="string"/>
         <cvalue name="GNOMAD_WES_COL" value="GNOMAD_EXOMES" type="string"/>
         <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg19DatabasesDirectory}/gnomAD/gnomad.exomes.r2.0.1.sites.SNVs.vcf.gz" type="path"/>
         <cvalue name="GNOMAD_WGS_COL" value="GNOMAD_GENOMES" type="string"/>
@@ -76,8 +72,6 @@
         <!--#### EXTRA FILTERING -->
         <!--#### BELOW VALUES WILL BE NEGLECTED WHEN isNoControlWorkflow IS SET TO FALSE -->
             <!--## Filter switch -->
-            <cvalue name="FILTER_ExAC" value="${isNoControlWorkflow}" type="boolean" />
-            <cvalue name="FILTER_EVS" value="${isNoControlWorkflow}" type="boolean" />
             <cvalue name="FILTER_GNOMAD_EXOMES" value="${isNoControlWorkflow}" type="boolean"/>
             <cvalue name="FILTER_GNOMAD_GENOMES" value="${isNoControlWorkflow}" type="boolean"/>
             <cvalue name="FILTER_1KGENOMES" value="${isNoControlWorkflow}" type="boolean" />
@@ -87,8 +81,6 @@
 
             <!--## Filter criteria -->
             <!--## The rows with corresponding values larger than below criteria will be filtered out. -->
-            <cvalue name="CRIT_ExAC_maxMAF" value="1.0" type="string" />
-            <cvalue name="CRIT_EVS_maxMAF" value="1.0" type="string" />
             <cvalue name="CRIT_GNOMAD_EXOMES_maxMAF" value="0.001" type="float"/>
             <cvalue name="CRIT_GNOMAD_GENOMES_maxMAF" value="0.001" type="float"/>
             <cvalue name="CRIT_1KGENOMES_maxMAF" value="0.01" type="string" />

From 8584941dc1a61cc7c45f87ca847b97a8e9b4fc13 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Fri, 11 Oct 2019 12:05:56 +0200
Subject: [PATCH 02/49] Removing DUKE, DAC and HiSeqDepth based confidence
 annotaions

---
 .../snvPipeline/confidenceAnnotation_SNVs.py  | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index df0e0fa..0e2aa0b 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -54,7 +54,7 @@ def main(args):
                   '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth at this position in the sample">\n' \
                   '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward, ref-reverse, alt-forward and alt-reverse bases">\n' \
                   '##FILTER=<ID=RE,Description="variant in UCSC_27Sept2013_RepeatMasker.bed.gz region and/or SimpleTandemRepeats_chr.bed.gz region, downloaded from UCSC genome browser and/or variant in segmental duplication region, annotated by annovar">\n' \
-                  '##FILTER=<ID=BL,Description="variant in DAC-Blacklist from ENCODE or in DUKE_EXCLUDED list, both downloaded from UCSC genome browser">\n' \
+                  '##FILTER=<ID=BL,Description="variant in DAC-Blacklist from ENCODE or in DUKE_EXCLUDED list, both downloaded from UCSC genome browser - # not used in hg38">\n' \
                   '##FILTER=<ID=DP,Description="<= 5 reads total at position in tumor">\n' \
                   '##FILTER=<ID=SB,Description="Strand bias of reads with mutant allele = zero reads on one strand">\n' \
                   '##FILTER=<ID=TAC,Description="less than 6 reads in Tumor at position">\n' \
@@ -98,8 +98,8 @@ def main(args):
 
         if line[0] == "#":
             headers = list(line[1:].rstrip().split('\t'))
-            fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED",
-                             "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
+            fixed_headers = ["^INFO$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", 
+                             "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
                              "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
                              "^seqBiasPresent_2$", "^seqingBiasPresent_2$"]
             variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", }
@@ -321,19 +321,19 @@ def main(args):
             filterfield["RE"] = 1
 
         # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat
-        if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]:
-            confidence -= 3 # really bad region, usually centromeric repeats
-            is_weird = True
-            reasons += "Blacklist(-3)"
-            filterfield["BL"] = 1
+        #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]:
+        #    confidence -= 3 # really bad region, usually centromeric repeats
+        #    is_weird = True
+        #    reasons += "Blacklist(-3)"
+        #    filterfield["BL"] = 1
 
         # HiSeqDepth: regions "attracting" reads; often coincide with tandem repeats and CEN/TEL,
         # not always with low mapability
-        if help["HISEQDEPTH_VALID"]:
-            confidence -= 3 # really really bad region!
-            is_weird = True
-            reasons += "Hiseqdepth(-3)"
-            filterfield["HSDEPTH"] = 1
+        #if help["HISEQDEPTH_VALID"]:
+        #    confidence -= 3 # really really bad region!
+        #    is_weird = True
+        #    reasons += "Hiseqdepth(-3)"
+        #    filterfield["HSDEPTH"] = 1
 
         # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ...
         # Everything with really high number of occurences is artefacts

From 7e49b5d9fa45165d440f84cb222097e43515867b Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Fri, 11 Oct 2019 12:07:50 +0200
Subject: [PATCH 03/49] Removing getRefGenomeAndChrPrefixFromHeader functions

---
 resources/analysisTools/snvPipeline/filter_vcf.sh    | 4 ++--
 resources/analysisTools/snvPipeline/snvAnnotation.sh | 6 +++---
 resources/analysisTools/snvPipeline/snvCalling.sh    | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh
index dc71e1e..1d57f0e 100755
--- a/resources/analysisTools/snvPipeline/filter_vcf.sh
+++ b/resources/analysisTools/snvPipeline/filter_vcf.sh
@@ -19,8 +19,8 @@ mpileupDirectory=`dirname ${FILENAME_VCF}`
 RUN_PLOTS=${RUN_PLOTS-1}
 RUN_PUREST=${RUN_PUREST-1}
 
-source ${TOOL_ANALYZE_BAM_HEADER}
-getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
+#source ${TOOL_ANALYZE_BAM_HEADER}
+#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
 ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}`
 
 # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail...
diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh
index d3c97f5..5a121bd 100755
--- a/resources/analysisTools/snvPipeline/snvAnnotation.sh
+++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh
@@ -36,8 +36,8 @@ scriptDirectory=`dirname ${WRAPPED_SCRIPT}`
 #TOOLS_DIR=${BASEPATH_TOOLS}
 FILEBASE=`dirname ${FILENAME_VCF_IN}`/`basename ${FILENAME_VCF_IN} .vcf`
 
-source ${TOOL_ANALYZE_BAM_HEADER}
-getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
+#source ${TOOL_ANALYZE_BAM_HEADER}
+#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
 
 
 [[ -f ${FILENAME_CHECKPOINT} ]] && rm ${FILENAME_CHECKPOINT}
@@ -90,7 +90,7 @@ if [[ ${isNoControlWorkflow-false} == "false" ]]; then
 fi
 
 cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${DBSNP} --columnName=${DBSNP_COL} --reportMatchType  --bAdditionalColumn=2 --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL}  --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \    
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL}  --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL} --columnName ${LOCALCONTROL_COL} --bFileType vcf --reportMatchType --reportLevel 4 "
diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh
index de88566..92f1113 100755
--- a/resources/analysisTools/snvPipeline/snvCalling.sh
+++ b/resources/analysisTools/snvPipeline/snvCalling.sh
@@ -24,8 +24,8 @@ fi
 PARM_CHR_INDEX=${PARM_CHR_INDEX-}
 
 # Ignore chr prefix and set it manually
-source ${TOOL_ANALYZE_BAM_HEADER}
-getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
+#source ${TOOL_ANALYZE_BAM_HEADER}
+#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
 
 chr=""
 if [[ -n "$PARM_CHR_INDEX" ]]

From ce97964057460c23f13dee706472a95cbdb15799 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Fri, 11 Oct 2019 12:08:56 +0200
Subject: [PATCH 04/49] Updating annotation file paths to hg38 directory

---
 .../configurationFiles/analysisSNVCalling.xml | 81 ++++++++++---------
 1 file changed, 43 insertions(+), 38 deletions(-)

diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index e55d168..7b1f92e 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -41,24 +41,29 @@
         <cvalue name="RAW_SNV_FILTER_OPTIONS" value='" --minVac=3 --minVaf=0.03 --minVacPS=2"' type="string"/>
         <!--<cvalue name='REFERENCE_GENOME' value="${referenceGenome_hg19_chr}" type="string"/>-->
 
+    	<!-- Reference genomes -->
+    	<cvalue name='REFERENCE_GENOME' value='${hg38RefereceDirtory}/GRCh38_d1_vd1_phiX/GRCh38_d1_vd1_phiX.fa' type="path"/>
+    	<cvalue name='INDEX_PREFIX' value='${hg38RefereceDirtory}/GRCh38_d1_vd1_phiX/bwa-0.7.15/GRCh38_d1_vd1_phiX.fa' type="path"/>
+    	<cvalue name='CHROM_SIZES_FILE' value='${hg38RefereceDirtory}/GRCh38_d1_vd1_phiX/stats/GRCh38_d1_vd1_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv' type="path" />
+    	<cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38RefereceDirtory}/GRCh38_d1_vd1_phiX/stats/GRCh38_d1_vd1_phiX.fa.chrLength.tsv' type="path" />
+
         <!--## Databases -->
         <!-- hg19DatabasesDirectory=/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases -->
-        <cvalue name="DBSNP" value="${hg19DatabasesDirectory}/dbSNP/dbSNP_147/00-All.SNV.vcf.gz" type="path"/>
+        <cvalue name="DBSNP" value="${hg38DatabasesDirectory}/dbSNP/dbSNP_151_GRCh38.p7/00-All_v151_GRCh38.SNV.vcf.gz" type="path"/>
         <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
-        <cvalue name="KGENOME" value="${hg19DatabasesDirectory}/1000genomes/ALL.wgs.phase1_integrated_calls.20101123.snps_chr.vcf.gz" type="path"/>
+        <cvalue name="KGENOME" value="${hg38DatabasesDirectory}/1000Genome/hg38/ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz" type="path"/>
         <cvalue name="KGENOMES_COL" value="1K_GENOMES" type="string"/>
         <cvalue name="GNOMAD_WES_COL" value="GNOMAD_EXOMES" type="string"/>
-        <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg19DatabasesDirectory}/gnomAD/gnomad.exomes.r2.0.1.sites.SNVs.vcf.gz" type="path"/>
+        <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/liftover_grch38/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
         <cvalue name="GNOMAD_WGS_COL" value="GNOMAD_GENOMES" type="string"/>
-        <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg19DatabasesDirectory}/gnomAD/gnomad.genomes.r2.0.1.sites.chr1-22-X.SNVs.All.vcf.gz" type="path"/>
-
-
-        <cvalue name="LOCALCONTROL" value="${hg19DatabasesDirectory}/LocalControls/ExclusionList_2019/MPsnvs_PLindels/ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
+        <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/liftover_grch38/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
+        <cvalue name="LOCALCONTROL" value="${hg38DatabasesDirectory}/liftover/GRCh38_ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
         <cvalue name="LOCALCONTROL_COL" value="LocalControlAF" type="string"/>
-        <cvalue name='ANNOVAR_BINARY' value='${sharedFilesBaseDirectory}/annovar/annovar_Feb2016/annotate_variation.pl' type='path'/>
-        <cvalue name='ANNOVAR_BUILDVER' value='hg19' type='string'/>
-        <cvalue name='ANNOVAR_DBPATH' value='${sharedFilesBaseDirectory}/annovar/annovar_Feb2016/humandb/' type='path'/>
-        <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV19'" type='string'/>
+
+        <cvalue name='ANNOVAR_BINARY' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/annotate_variation.pl' type='path'/>
+        <cvalue name='ANNOVAR_BUILDVER' value='hg38' type='string'/>
+        <cvalue name='ANNOVAR_DBPATH' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/humandb/' type='path'/>
+        <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV31'" type='string'/>
         <cvalue name="ANNOVAR_GENEANNO_COLS" value='"ANNOVAR_FUNCTION,GENE,EXONIC_CLASSIFICATION,ANNOVAR_TRANSCRIPTS"' type="string"/>
         <cvalue name="ANNOVAR_SEGDUP_COL" value="SEGDUP" type="string"/>
         <cvalue name="ANNOVAR_CYTOBAND_COL" value="CYTOBAND" type="string"/>
@@ -123,13 +128,13 @@
 
 
         <configurationValueBundle name='PIPE_CONFIG:SNV_RELIABILITY'>
-            <cvalue name='MAPABILITY' value='${hg19DatabaseUCSCDirectory}/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz' type="path"/>
-            <cvalue name='HISEQDEPTH' value='${hg19DatabaseUCSCDirectory}/HiSeqDepthTop10Pct_chr.bed.gz' type="path"/>
-            <cvalue name='SIMPLE_TANDEMREPEATS' value='${hg19DatabaseUCSCDirectory}/repeats/SimpleTandemRepeats_chr.bed.gz:4' type="path"/>
-            <cvalue name='REPEAT_MASKER' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_RepeatMasker.bed.gz' type="path"/>
-            <cvalue name='DUKE_EXCLUDED' value='${hg19DatabaseUCSCDirectory}/DukeExcluded_chr.bed.gz' type="path"/>
-            <cvalue name='DAC_BLACKLIST' value='${hg19DatabaseUCSCDirectory}/DACBlacklist_chr.bed.gz' type="path"/>
-            <cvalue name='SELFCHAIN' value='${hg19DatabaseUCSCDirectory}/selfChain_chr.bed.gz:4::--maxNrOfMatches=5' type="path"/>
+        	<cvalue name='MAPABILITY' value='${hg38DatabasesDirectory}/mapability/GRCh38_Mappability_Align_100mer.bedGraph.gz' type="path"/>
+        	<!--cvalue name='HISEQDEPTH' value='${hg38DatabaseUCSCDirectory}/HiSeqDepthTop10Pct_chr.bed.gz' type="path"/-->
+        	<cvalue name='SIMPLE_TANDEMREPEATS' value='${hg38DatabaseUCSCDirectory}/simple_tandem_repeats/2019-08-15_SimpleTandemRepeats_Human_hg38.sorted_plain.bed.gz:4' type="path"/>
+        	<cvalue name='REPEAT_MASKER' value='${hg38DatabaseUCSCDirectory}/repeatMasker/2019-08-14_RepeatMasker_Human_hg38.sorted_plain.bed.gz' type="path"/>
+        	<!--cvalue name='DUKE_EXCLUDED' value='${hg38DatabaseUCSCDirectory}/DukeExcluded_chr.bed.gz' type="path"/-->
+        	<!--cvalue name='DAC_BLACKLIST' value='${hg38DatabaseUCSCDirectory}/DACBlacklist_chr.bed.gz' type="path"/-->
+        	<!--cvalue name='SELFCHAIN' value='${hg38DatabaseUCSCDirectory}/selfChain_chr.bed.gz:4::\-\-maxNrOfMatches=5' type="path"/-->
         </configurationValueBundle>
 
 
@@ -145,17 +150,17 @@
             # see also the usage report of ngs2/trunk/tools/annotate_vcf.pl
             # if any whitespaces are required, use quotation marks like in this example: #MAPABILITY="/ibios/co02/annotation/hg19/wgEncodeCrgMapabilityAlign100mer.bedGraph.gz:::-breakPointMode -aEndOffset=1"-->
         <configurationValueBundle name='PIPE_CONFIG:SNV_DEEPANNOTATION'>
-            <cvalue name='CpGislands' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_CpG_islands.bed.gz' type="path"/>
-            <cvalue name='CgiMountains' value='${assembliesHG191000GenomesDirectory}/CustomDeepAnnotation/CgiMountains_chr.bed.gz:4' type="path"/>
-            <cvalue name='Enhancers' value='${hg19DatabaseUCSCDirectory}/Enhancers_Vista_2011_01_14_chr.bed.gz' type="path"/>
-            <cvalue name='miRNAs_snoRNAs' value='${hg19DatabasesDirectory}/miRNA/miRNA_snoRNAs_miRBaseRelease15_chr.bed.gz' type="path"/>
-            <cvalue name='miRBase18' value='${hg19DatabasesDirectory}/miRNA/miRBase_version-18_chr.bed.gz' type="path"/>
-            <cvalue name='miRNAtargets' value='${hg19DatabasesDirectory}/miRNA/miRNAsites_TargetScan_BartelLab_2011_01_14_chr.bed.gz' type="path"/>
-            <cvalue name='phastConsElem20bp' value='${hg19DatabaseUCSCDirectory}/phastConsElem_min20bp_chr.bed.gz:4' type="path"/>
-            <cvalue name='TFBScons' value='${hg19DatabaseUCSCDirectory}/tfbsConsSites_noncoding_merged_chr.bed.gz' type="path"/>
-            <cvalue name='COSMIC' value='${hg19DatabasesDirectory}/COSMIC/Cosmic_v77_hg19_coding_SNVs.bed.gz:7,8,9:1' type="path"/>
-            <cvalue name='ENCODE_DNASE' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_DNase_cluster_V2.bed.gz' type="path"/>
-            <cvalue name='ENCODE_TFBS' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_wgEncodeRegTfbsClusteredV3.bed.gz' type="path"/>
+        	<cvalue name='CpGislands' value='${hg38DatabaseUCSCDirectory}/cpg_islands/2019-08-15_CpG_Islands_Human_hg38.sorted_plain.bed.gz' type="path"/>
+        	<!--cvalue name='CgiMountains' value='${assembliesHG191000GenomesDirectory}/CustomDeepAnnotation/CgiMountains_chr.bed.gz:4' type="path"/-->
+        	<cvalue name='Enhancers' value='${hg38DatabasesDirectory}/enhancers/FANTOM5/F5.hg38.enhancers.sorted_plain.bed.gz' type="path"/>
+        	<cvalue name='miRNAs_sncRNAs' value='${hg38DatabasesDirectory}/miRNA/dashr.v2.sncRNA.annotation.hg38.sncType.sorted_plain.bed.gz' type="path"/>
+        	<cvalue name='miRBase' value='${hg38DatabasesDirectory}/miRNA/miRBase_version-22_hg38.sorted_plain.bed.gz' type="path"/>
+        	<!--cvalue name='miRNAtargets' value='${hg38DatabasesDirectory}/miRNA/miRNAsites_TargetScan_BartelLab_2011_01_14_chr.bed.gz' type="path"/-->
+        	<!--cvalue name='phastConsElem20bp' value='${hg38DatabaseUCSCDirectory}/phastConsElem_min20bp_chr.bed.gz:4' type="path"/-->
+        	<!--cvalue name='TFBScons' value='${hg38DatabaseUCSCDirectory}/tfbsConsSites_noncoding_merged_chr.bed.gz' type="path"/-->
+        	<cvalue name='COSMIC' value='${hg38DatabasesDirectory}/COSMIC/cosmic_v81/Cosmic_v81_hg38_all.coding_snvs.bed.gz:7,8,9:1' type="path"/>
+        	<cvalue name='ENCODE_DNASE' value='${hg38DatabaseUCSCDirectory}/DNAase/2019-08-15_Encode_DNAase_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>
+        	<cvalue name='ENCODE_TFBS' value='${hg38DatabaseUCSCDirectory}/TF_clusters/2019-08-15_Encode_TF_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>
         </configurationValueBundle>
         <!--#>PIPE_CONFIG-->
 
@@ -196,8 +201,8 @@
         </tool>
         <tool name="snvCallingNoControl" value="snvCalling.sh" basepath="snvPipeline">
             <resourcesets>
-                <rset size="s" memory="1" cores="1" nodes="1" walltime="1"/>
-                <rset size="m" memory="1" cores="1" nodes="1" walltime="4"/>
+                <rset size="s" memory="1" cores="1" nodes="1" walltime="6"/>
+                <rset size="m" memory="1" cores="1" nodes="1" walltime="12"/>
                 <rset size="l" memory="1" cores="1" nodes="1" walltime="24"/>
                 <rset size="xl" memory="4" cores="1" nodes="1" walltime="48"/>
             </resourcesets>
@@ -223,9 +228,9 @@
         <tool name="snvAnnotation" value="snvAnnotation.sh" basepath="snvPipeline">
             <resourcesets>
                 <rset size="t" memory="1" cores="1" nodes="1" walltime="1" queue="devel"/>
-                <rset size="s" memory="3" cores="2" nodes="1" walltime="4"/>
-                <rset size="m" memory="5" cores="3" nodes="1" walltime="6"/>
-                <rset size="l" memory="5" cores="3" nodes="1" walltime="10"/>
+                <rset size="s" memory="3" cores="2" nodes="1" walltime="6"/>
+                <rset size="m" memory="5" cores="3" nodes="1" walltime="12"/>
+                <rset size="l" memory="5" cores="3" nodes="1" walltime="24"/>
                 <rset size="xl" memory="6" cores="3" nodes="1" walltime="48"/>
             </resourcesets>
             <input type="file" typeof="de.dkfz.b080.co.files.SNVAnnotationFile" scriptparameter="FILENAME_VCF_IN"/>
@@ -239,10 +244,10 @@
         <tool name="snvDeepAnnotation" value="vcf_pipeAnnotator.sh" basepath="tools">
             <resourcesets>
                 <rset size="t" memory="1" cores="1" nodes="1" walltime="1" queue="devel"/>
-                <rset size="s" memory="1" cores="3" nodes="1" walltime="1"/>
-                <rset size="m" memory="1" cores="3" nodes="1" walltime="2"/>
-                <rset size="l" memory="1" cores="3" nodes="1" walltime="3"/>
-                <rset size="xl" memory="4" cores="3" nodes="1" walltime="4"/>
+                <rset size="s" memory="1" cores="3" nodes="1" walltime="4"/>
+                <rset size="m" memory="1" cores="3" nodes="1" walltime="8"/>
+                <rset size="l" memory="1" cores="3" nodes="1" walltime="12"/>
+                <rset size="xl" memory="4" cores="3" nodes="1" walltime="24"/>
             </resourcesets>
             <input type="file" typeof="de.dkfz.b080.co.files.SNVAnnotationFile" scriptparameter="FILENAME_VCF"/>
             <output type="filegroup" typeof="de.dkfz.b080.co.files.VCFFileWithCheckpointFile">

From c5142972e37c1e916d865108b9726306d749aee3 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Tue, 21 Jan 2020 14:39:32 +0100
Subject: [PATCH 05/49] Removing the hg38 encode blacklist filter

---
 .../snvPipeline/confidenceAnnotation_SNVs.py          |  7 +++++++
 resources/configurationFiles/analysisSNVCalling.xml   | 11 ++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index 0e2aa0b..6c8d22c 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -320,6 +320,13 @@ def main(args):
             reasons += "Segmental_dup(-2)"
             filterfield["RE"] = 1
 
+        # Encode blacklist, generated for hg38 mappings
+#        if help["ENCODE_BLACKLIST_VALID"]:
+#            confidence -= 3 # High signal region and low mappability
+#            is_weird = True
+#            reasons += "Blacklist(-3)"
+#            filterfield["BL"] = 1
+
         # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat
         #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]:
         #    confidence -= 3 # really bad region, usually centromeric repeats
diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 7b1f92e..bff9649 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -42,10 +42,10 @@
         <!--<cvalue name='REFERENCE_GENOME' value="${referenceGenome_hg19_chr}" type="string"/>-->
 
     	<!-- Reference genomes -->
-    	<cvalue name='REFERENCE_GENOME' value='${hg38RefereceDirtory}/GRCh38_d1_vd1_phiX/GRCh38_d1_vd1_phiX.fa' type="path"/>
-    	<cvalue name='INDEX_PREFIX' value='${hg38RefereceDirtory}/GRCh38_d1_vd1_phiX/bwa-0.7.15/GRCh38_d1_vd1_phiX.fa' type="path"/>
-    	<cvalue name='CHROM_SIZES_FILE' value='${hg38RefereceDirtory}/GRCh38_d1_vd1_phiX/stats/GRCh38_d1_vd1_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv' type="path" />
-    	<cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38RefereceDirtory}/GRCh38_d1_vd1_phiX/stats/GRCh38_d1_vd1_phiX.fa.chrLength.tsv' type="path" />
+    	<cvalue name='REFERENCE_GENOME' value='${hg38ReferenceDirectory}/GRCh38_d1_vd1_phiX/GRCh38_d1_vd1_phiX.fa' type="path"/>
+    	<cvalue name='INDEX_PREFIX' value='${hg38ReferenceDirectory}/GRCh38_d1_vd1_phiX/bwa-0.7.15/GRCh38_d1_vd1_phiX.fa' type="path"/>
+    	<cvalue name='CHROM_SIZES_FILE' value='${hg38ReferenceDirectory}/GRCh38_d1_vd1_phiX/stats/GRCh38_d1_vd1_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv' type="path" />
+    	<cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38ReferenceDirectory}/GRCh38_d1_vd1_phiX/stats/GRCh38_d1_vd1_phiX.fa.chrLength.tsv' type="path" />
 
         <!--## Databases -->
         <!-- hg19DatabasesDirectory=/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases -->
@@ -56,7 +56,7 @@
         <cvalue name="GNOMAD_WES_COL" value="GNOMAD_EXOMES" type="string"/>
         <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/liftover_grch38/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
         <cvalue name="GNOMAD_WGS_COL" value="GNOMAD_GENOMES" type="string"/>
-        <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/liftover_grch38/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
+        <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v3/genomes/GRCh38/gnomad.genomes.r3.0.sites.SNV.vcf.gz" type="path"/>
         <cvalue name="LOCALCONTROL" value="${hg38DatabasesDirectory}/liftover/GRCh38_ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
         <cvalue name="LOCALCONTROL_COL" value="LocalControlAF" type="string"/>
 
@@ -132,6 +132,7 @@
         	<!--cvalue name='HISEQDEPTH' value='${hg38DatabaseUCSCDirectory}/HiSeqDepthTop10Pct_chr.bed.gz' type="path"/-->
         	<cvalue name='SIMPLE_TANDEMREPEATS' value='${hg38DatabaseUCSCDirectory}/simple_tandem_repeats/2019-08-15_SimpleTandemRepeats_Human_hg38.sorted_plain.bed.gz:4' type="path"/>
         	<cvalue name='REPEAT_MASKER' value='${hg38DatabaseUCSCDirectory}/repeatMasker/2019-08-14_RepeatMasker_Human_hg38.sorted_plain.bed.gz' type="path"/>
+                <!--cvalue name='ENCODE_BLACKLIST' value='${hg38DatabaseUCSCDirectory}/encode_blacklist/hg38-blacklist.v2.bed.gz' type="path"/-->
         	<!--cvalue name='DUKE_EXCLUDED' value='${hg38DatabaseUCSCDirectory}/DukeExcluded_chr.bed.gz' type="path"/-->
         	<!--cvalue name='DAC_BLACKLIST' value='${hg38DatabaseUCSCDirectory}/DACBlacklist_chr.bed.gz' type="path"/-->
         	<!--cvalue name='SELFCHAIN' value='${hg38DatabaseUCSCDirectory}/selfChain_chr.bed.gz:4::\-\-maxNrOfMatches=5' type="path"/-->

From 3bebd1f64f59b0a75b298255d81d71c47227e11f Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 7 Jan 2021 11:38:26 +0100
Subject: [PATCH 06/49] Variant calling from CRAM files

---
 resources/analysisTools/snvPipeline/filter_PEoverlap.py | 6 +++---
 resources/analysisTools/snvPipeline/snvAnnotation.sh    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
index b8c06c1..6a4bd18 100755
--- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py
+++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
@@ -123,15 +123,15 @@ def performAnalysis(args):
     #vcfInFile = open(args.inf, "r")
     #outFile = open(args.outf, "w")
 
-    # Reference file for BAQ_recalcuation and local realignment
-    reference_file = pysam.Fastafile(args.refFileName)
+    # Reference file for CRAM files
+    reference_file = args.refFileName
 
     mode = "r"
     if args.alignmentFile[-4:] == ".bam":
         mode += "b"
     elif args.alignmentFile[-5:] == ".cram":
         mode += "c"
-    samfile = pysam.Samfile(args.alignmentFile, mode)  # This should work for BAM file only (with random access).
+    samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file)  # This should work for BAM file only (with random access).
 
     if args.altPosF != '':
         ALT_basePositions_file = args.altPosF
diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh
index 5a121bd..4400766 100755
--- a/resources/analysisTools/snvPipeline/snvAnnotation.sh
+++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh
@@ -17,7 +17,7 @@ declare -r outputDirectory=`dirname ${FILENAME_VCF_OUT}`
 relinked=false
 # (Re-)link bam file and index. The pipeline cannot handle i.e. .mdup.bam and .mdup.bai
 # Look up the index with .bam.bai. If the file exists nothing happens.
-if [[ ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.bai ]]
+if [[ ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.bai && ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.crai ]]
 then
     shortBamIdx=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}`"/"`basename ${TUMOR_BAMFILE_FULLPATH_BP} .bam`".bai"
     [[ ! -f ${shortBamIdx} ]] && echo "Bam index file cannot be found!" && exit -15

From b0ac261f6b9edb811e784bc352071bfa45f91a82 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 11 Feb 2021 13:14:58 +0100
Subject: [PATCH 07/49] Checking for nonREFnonALT in BoolCounter class

---
 .../snvPipeline/filter_PEoverlap.py           | 72 +++++++++++--------
 1 file changed, 43 insertions(+), 29 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
index 6a4bd18..d6f6a97 100755
--- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py
+++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
@@ -90,24 +90,29 @@ def decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, D
         if remove_is_reverse:
             if DP4ar > 0: DP4ar -= 1
         else:
-            if DP4af > 0: DP4af -= 1   
+            if DP4af > 0: DP4af -= 1
     return(DP4rf, DP4rr, DP4af, DP4ar)
 
 class BoolCounter:
     """ A class for counter objects """ 
 
-    def __init__(self):
-        self.ref = 0
-        self.alt = 0
+    def __init__(self, ref, alt):
+        self.REF = ref
+        self.ALT = alt
+        self.ref_count = 0
+        self.alt_count = 0
 
-    def update(self, remove):
-        if (remove):
-            self.ref += 1
-        else:
-            self.alt += 1
+    def update(self, current_base):
+        if self.REF == current_base:
+            self.ref_count += 1
+        elif self.ALT == current_base:
+            self.alt_count += 1
+
+    def update_nonREFnonALT(self, remove_count):
+        self.alt_count =+ remove_count
 
     def counted(self):
-        return [self.ref, self.alt]
+        return [self.ref_count, self.alt_count]
 
 # MAIN ANALYSIS PROCEDURE
 def performAnalysis(args):
@@ -129,9 +134,12 @@ def performAnalysis(args):
     mode = "r"
     if args.alignmentFile[-4:] == ".bam":
         mode += "b"
+        multiple_iterators = False
+        samfile = pysam.Samfile(args.alignmentFile, mode)
     elif args.alignmentFile[-5:] == ".cram":
         mode += "c"
-    samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file)  # This should work for BAM file only (with random access).
+        multiple_iterators = False
+        samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file)        
 
     if args.altPosF != '':
         ALT_basePositions_file = args.altPosF
@@ -195,12 +203,13 @@ def performAnalysis(args):
 
             ACGTNacgtn1 = [0]*10
             ACGTNacgtn2 = [0]*10
-            count_PE = BoolCounter() # Starting the counter for the forward and reverse reads removed due to PE overlap detection  
-            count_supple = BoolCounter() # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions
-            count_mismatch = BoolCounter() # " for mismatch report 
+            count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection  
+            count_supple = BoolCounter(REF, ALT) # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions
+            count_mismatch = BoolCounter(REF, ALT) # " for mismatch report
+            count_nonREFnonALT = BoolCounter(REF, ALT) # " for nonREF and nonALT bases
 
             # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq).
-            for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False):
+            for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators):
                 if pileupcolumn.pos == (pos-1):                	
                     #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments)                    
                     for pileupread in pileupcolumn.pileups:                    	
@@ -244,7 +253,7 @@ def performAnalysis(args):
                                             if numberOfMismatches > args.allowedNumberOfMismatches:
                                                 remove_base = pileupread.alignment.seq[pileupread.query_position]
                                                 remove_is_reverse = pileupread.alignment.is_reverse
-                                                count_mismatch.update(remove_base == REF)
+                                                count_mismatch.update(remove_base)
                                                 (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)                                         
                                                 # after decreasing the respective DP4 value, go directly to the next read
                                                 # without remembering the current read
@@ -277,7 +286,6 @@ def performAnalysis(args):
 
                                         #if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:        # DEBUG July 23 2012: BROAD BAM problem due to pileupread.alignment.qqual being shorter sometimes than pileupread.alignment.qual
                                         if(pileupread.alignment.query_name in readNameHash):
-                                            #print pileupread.alignment.query_name
                                             old_qual = readNameHash[pileupread.alignment.query_name][0]
                                             old_base = readNameHash[pileupread.alignment.query_name][1]
                                             old_is_reverse = readNameHash[pileupread.alignment.query_name][2]
@@ -303,9 +311,10 @@ def performAnalysis(args):
                                                 remove_base = current_base
                                                 remove_is_reverse = current_is_reverse
                                                 remove_old = False
-                                            
-                                            count_PE.update(remove_base == REF)
-                                            (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)                                                                                        
+
+                                            count_PE.update(remove_base)
+                                            (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
+
                                             # If current base is better, then removing the information about old mate
                                             # If current base is not good, then do nothing
                                             if remove_old:
@@ -382,8 +391,7 @@ def performAnalysis(args):
                     break # only one pileup for a position
 
             # Calculating duplicates based on read-mate pair's start positions (chr id and start location)
-            count_duplicate = BoolCounter()
-
+            count_duplicate = BoolCounter(REF, ALT)
             for key in readMateHash:
             	value_length = len(readMateHash[key])            	            	
             	if value_length > 0:            		
@@ -394,20 +402,26 @@ def performAnalysis(args):
                         remove_base = decreaseInfo[0]
                         remove_is_reverse = decreaseInfo[1]
 
-                        count_duplicate.update(remove_base == REF)
+                        count_duplicate.update(remove_base)
                         (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
 
             nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd
+
+            if (DP4[2] + DP4[3]) > ALTcount:    # that the ALTcount is larger  happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling                        
+                if DP4af >= nonREFnonALTfwd: 
+                    DP4af -= nonREFnonALTfwd
+                    count_nonREFnonALT.update_nonREFnonALT(nonREFnonALTfwd)
+
+                if DP4ar >= nonREFnonALTrev: 
+                    DP4ar -= nonREFnonALTrev
+                    count_nonREFnonALT.update_nonREFnonALT(nonREFnonALTrev)
+            
             #Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)"
             supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted()))
-            supple_dup_str += 'DP2dup=' + ','.join(map(str, count_duplicate.counted()))
+            supple_dup_str += ';DP2dup=' + ','.join(map(str, count_duplicate.counted()))
             supple_dup_str += ';DP2pairEnd=' + ','.join(map(str, count_PE.counted()))
             supple_dup_str += ';DP2mis=' + ','.join(map(str, count_mismatch.counted()))
-            supple_dup_str += ';DPnonREFnonALT=' + str(nonREFnonALT)
-
-            if (DP4[2] + DP4[3]) > ALTcount:    # that the ALTcount is larger  happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling                
-                if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd
-                if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev
+            supple_dup_str += ';DP2nonREFnonALT=' + ','.join(map(str, count_nonREFnonALT.counted()))
             
             ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1])
             ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2])

From 674900f419d71119837bbf5cc0355d4463b6d386 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 11 Feb 2021 13:33:46 +0100
Subject: [PATCH 08/49] updating pysam to 0.16.0.1

---
 .../analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
index 69338b1..e1aeb9d 100755
--- a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
+++ b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
@@ -19,7 +19,7 @@ module load bedtools/"${BEDTOOLS_VERSION:?BEDTOOLS_VERSION undefined}"
 module load pypy/"${PYPY_VERSION:?PYPY_VERSION undefined}"
 
 set +ue
-source /odcf/cluster/virtualenvs/paramasi/python_2.7.9_SNVCalling_pysam_0.15.2/bin/activate
+source /dkfz/cluster/virtualenvs/paramasi/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate
 set -ue
 
 export BGZIP_BINARY=bgzip

From 128c1c0bc0ea39f2231f74f5c0567bf518527043 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 11 Feb 2021 13:34:23 +0100
Subject: [PATCH 09/49] Moving files to ngs_share

---
 .../configurationFiles/analysisSNVCalling.xml | 20 +++++++------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index bff9649..3afbcc1 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -49,15 +49,18 @@
 
         <!--## Databases -->
         <!-- hg19DatabasesDirectory=/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases -->
+        <cvalue name="hg38DatabasesDirectory" value="/icgc/ngs_share/assemblies/hg_GRCh38/databases" type="path" />
+        <cvalue name="hg38DatabaseUCSCDirectory" value="/icgc/ngs_share/assemblies/hg_GRCh38/databases/UCSC" type="path" /> 
+
         <cvalue name="DBSNP" value="${hg38DatabasesDirectory}/dbSNP/dbSNP_151_GRCh38.p7/00-All_v151_GRCh38.SNV.vcf.gz" type="path"/>
         <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
         <cvalue name="KGENOME" value="${hg38DatabasesDirectory}/1000Genome/hg38/ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz" type="path"/>
         <cvalue name="KGENOMES_COL" value="1K_GENOMES" type="string"/>
         <cvalue name="GNOMAD_WES_COL" value="GNOMAD_EXOMES" type="string"/>
-        <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/liftover_grch38/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
+        <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/exomes_liftover_grch38/gnomad.exomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
         <cvalue name="GNOMAD_WGS_COL" value="GNOMAD_GENOMES" type="string"/>
-        <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v3/genomes/GRCh38/gnomad.genomes.r3.0.sites.SNV.vcf.gz" type="path"/>
-        <cvalue name="LOCALCONTROL" value="${hg38DatabasesDirectory}/liftover/GRCh38_ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
+        <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v3/genomes/gnomad.genomes.r3.0.sites.SNV.vcf.gz" type="path"/>
+        <cvalue name="LOCALCONTROL" value="${hg38DatabasesDirectory}/LocalControls/ExclusionList_2019_liftover/MPsnvs_PLindels/GRCh38_ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
         <cvalue name="LOCALCONTROL_COL" value="LocalControlAF" type="string"/>
 
         <cvalue name='ANNOVAR_BINARY' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/annotate_variation.pl' type='path'/>
@@ -128,14 +131,9 @@
 
 
         <configurationValueBundle name='PIPE_CONFIG:SNV_RELIABILITY'>
-        	<cvalue name='MAPABILITY' value='${hg38DatabasesDirectory}/mapability/GRCh38_Mappability_Align_100mer.bedGraph.gz' type="path"/>
-        	<!--cvalue name='HISEQDEPTH' value='${hg38DatabaseUCSCDirectory}/HiSeqDepthTop10Pct_chr.bed.gz' type="path"/-->
+        	<cvalue name='MAPABILITY' value='${hg38DatabasesDirectory}/mappability/GRCh38_Mappability_Align_100mer.bedGraph.gz' type="path"/>
         	<cvalue name='SIMPLE_TANDEMREPEATS' value='${hg38DatabaseUCSCDirectory}/simple_tandem_repeats/2019-08-15_SimpleTandemRepeats_Human_hg38.sorted_plain.bed.gz:4' type="path"/>
         	<cvalue name='REPEAT_MASKER' value='${hg38DatabaseUCSCDirectory}/repeatMasker/2019-08-14_RepeatMasker_Human_hg38.sorted_plain.bed.gz' type="path"/>
-                <!--cvalue name='ENCODE_BLACKLIST' value='${hg38DatabaseUCSCDirectory}/encode_blacklist/hg38-blacklist.v2.bed.gz' type="path"/-->
-        	<!--cvalue name='DUKE_EXCLUDED' value='${hg38DatabaseUCSCDirectory}/DukeExcluded_chr.bed.gz' type="path"/-->
-        	<!--cvalue name='DAC_BLACKLIST' value='${hg38DatabaseUCSCDirectory}/DACBlacklist_chr.bed.gz' type="path"/-->
-        	<!--cvalue name='SELFCHAIN' value='${hg38DatabaseUCSCDirectory}/selfChain_chr.bed.gz:4::\-\-maxNrOfMatches=5' type="path"/-->
         </configurationValueBundle>
 
 
@@ -152,13 +150,9 @@
             # if any whitespaces are required, use quotation marks like in this example: #MAPABILITY="/ibios/co02/annotation/hg19/wgEncodeCrgMapabilityAlign100mer.bedGraph.gz:::-breakPointMode -aEndOffset=1"-->
         <configurationValueBundle name='PIPE_CONFIG:SNV_DEEPANNOTATION'>
         	<cvalue name='CpGislands' value='${hg38DatabaseUCSCDirectory}/cpg_islands/2019-08-15_CpG_Islands_Human_hg38.sorted_plain.bed.gz' type="path"/>
-        	<!--cvalue name='CgiMountains' value='${assembliesHG191000GenomesDirectory}/CustomDeepAnnotation/CgiMountains_chr.bed.gz:4' type="path"/-->
         	<cvalue name='Enhancers' value='${hg38DatabasesDirectory}/enhancers/FANTOM5/F5.hg38.enhancers.sorted_plain.bed.gz' type="path"/>
         	<cvalue name='miRNAs_sncRNAs' value='${hg38DatabasesDirectory}/miRNA/dashr.v2.sncRNA.annotation.hg38.sncType.sorted_plain.bed.gz' type="path"/>
         	<cvalue name='miRBase' value='${hg38DatabasesDirectory}/miRNA/miRBase_version-22_hg38.sorted_plain.bed.gz' type="path"/>
-        	<!--cvalue name='miRNAtargets' value='${hg38DatabasesDirectory}/miRNA/miRNAsites_TargetScan_BartelLab_2011_01_14_chr.bed.gz' type="path"/-->
-        	<!--cvalue name='phastConsElem20bp' value='${hg38DatabaseUCSCDirectory}/phastConsElem_min20bp_chr.bed.gz:4' type="path"/-->
-        	<!--cvalue name='TFBScons' value='${hg38DatabaseUCSCDirectory}/tfbsConsSites_noncoding_merged_chr.bed.gz' type="path"/-->
         	<cvalue name='COSMIC' value='${hg38DatabasesDirectory}/COSMIC/cosmic_v81/Cosmic_v81_hg38_all.coding_snvs.bed.gz:7,8,9:1' type="path"/>
         	<cvalue name='ENCODE_DNASE' value='${hg38DatabaseUCSCDirectory}/DNAase/2019-08-15_Encode_DNAase_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>
         	<cvalue name='ENCODE_TFBS' value='${hg38DatabaseUCSCDirectory}/TF_clusters/2019-08-15_Encode_TF_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>

From 2de0727d2e90798cd017bfd5f40fee5935f7b844 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 11 Mar 2021 10:27:11 +0100
Subject: [PATCH 10/49] Updating BoolCounter class

---
 .../snvPipeline/filter_PEoverlap.py           | 46 +++++++------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
index d6f6a97..f0cabc3 100755
--- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py
+++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
@@ -96,20 +96,20 @@ def decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, D
 class BoolCounter:
     """ A class for counter objects """ 
 
-    def __init__(self, ref, alt):
-        self.REF = ref
-        self.ALT = alt
+    def __init__(self, ref_base, alt_base):
+        self.ref_base = ref_base
+        self.alt_base = alt_base
         self.ref_count = 0
         self.alt_count = 0
 
     def update(self, current_base):
-        if self.REF == current_base:
+        if self.ref_base == current_base:
             self.ref_count += 1
-        elif self.ALT == current_base:
+        elif self.alt_base == current_base:
             self.alt_count += 1
 
-    def update_nonREFnonALT(self, remove_count):
-        self.alt_count =+ remove_count
+    def update_nonREFnonALT(self, count):
+        self.alt_count =+ count
 
     def counted(self):
         return [self.ref_count, self.alt_count]
@@ -132,13 +132,12 @@ def performAnalysis(args):
     reference_file = args.refFileName
 
     mode = "r"
-    if args.alignmentFile[-4:] == ".bam":
-        mode += "b"
-        multiple_iterators = False
+    multiple_iterators = False
+    if args.alignmentFile.split(".")[-1] == "bam":
+        mode += "b"        
         samfile = pysam.Samfile(args.alignmentFile, mode)
-    elif args.alignmentFile[-5:] == ".cram":
-        mode += "c"
-        multiple_iterators = False
+    elif args.alignmentFile.split(".")[-1] == "cram":
+        mode += "c"        
         samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file)        
 
     if args.altPosF != '':
@@ -274,14 +273,6 @@ def performAnalysis(args):
                                             ACGTNacgtn1[ACGTNacgtn_index[1]] += 1
                                         else:
                                             ACGTNacgtn2[ACGTNacgtn_index[1]] += 1
- 
-                                        # Supplementary reads are removed with 'flag_filter=3844' argument in samfile.pileup() above, so this block is commented out. 
-                                        #if(pileupread.alignment.is_supplementary):
-                                        #    remove_base = pileupread.alignment.seq[pileupread.query_position]
-                                        #    remove_is_reverse = pileupread.alignment.is_reverse
-                                        #    count_supple.update(remove_base == REF)
-                                        #    (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
-                                        #    continue
 
 
                                         #if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:        # DEBUG July 23 2012: BROAD BAM problem due to pileupread.alignment.qqual being shorter sometimes than pileupread.alignment.qual
@@ -405,16 +396,13 @@ def performAnalysis(args):
                         count_duplicate.update(remove_base)
                         (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
 
-            nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd
-
             if (DP4[2] + DP4[3]) > ALTcount:    # that the ALTcount is larger  happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling                        
-                if DP4af >= nonREFnonALTfwd: 
-                    DP4af -= nonREFnonALTfwd
-                    count_nonREFnonALT.update_nonREFnonALT(nonREFnonALTfwd)
+                if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd                
 
-                if DP4ar >= nonREFnonALTrev: 
-                    DP4ar -= nonREFnonALTrev
-                    count_nonREFnonALT.update_nonREFnonALT(nonREFnonALTrev)
+                if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev
+                
+                nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd
+                count_nonREFnonALT.update_nonREFnonALT(nonREFnonALT)
             
             #Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)"
             supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted()))

From d8f3cf166c553992b37d3b63458ad381bbe876b8 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 11 Mar 2021 10:31:42 +0100
Subject: [PATCH 11/49] REF name via BAM header

---
 resources/analysisTools/snvPipeline/filter_vcf.sh | 4 ++--
 resources/analysisTools/snvPipeline/snvCalling.sh | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh
index 1d57f0e..dc71e1e 100755
--- a/resources/analysisTools/snvPipeline/filter_vcf.sh
+++ b/resources/analysisTools/snvPipeline/filter_vcf.sh
@@ -19,8 +19,8 @@ mpileupDirectory=`dirname ${FILENAME_VCF}`
 RUN_PLOTS=${RUN_PLOTS-1}
 RUN_PUREST=${RUN_PUREST-1}
 
-#source ${TOOL_ANALYZE_BAM_HEADER}
-#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
+source ${TOOL_ANALYZE_BAM_HEADER}
+getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
 ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}`
 
 # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail...
diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh
index 92f1113..de88566 100755
--- a/resources/analysisTools/snvPipeline/snvCalling.sh
+++ b/resources/analysisTools/snvPipeline/snvCalling.sh
@@ -24,8 +24,8 @@ fi
 PARM_CHR_INDEX=${PARM_CHR_INDEX-}
 
 # Ignore chr prefix and set it manually
-#source ${TOOL_ANALYZE_BAM_HEADER}
-#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
+source ${TOOL_ANALYZE_BAM_HEADER}
+getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
 
 chr=""
 if [[ -n "$PARM_CHR_INDEX" ]]

From b99be8f5a627b4e3c447262db17f2d6e7171997e Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 11 Mar 2021 10:34:28 +0100
Subject: [PATCH 12/49] Reverting generic xml to hg19

---
 .../configurationFiles/analysisSNVCalling.xml | 70 ++++++++++---------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 3afbcc1..6049fab 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -39,34 +39,32 @@
         <cvalue name='SNVFILE_PREFIX' value='snvs_' type="string"/>
         <!--<cvalue name='analysisMethodNameOnInput' value='whole_genome_analysis' type='string' />-->
         <cvalue name="RAW_SNV_FILTER_OPTIONS" value='" --minVac=3 --minVaf=0.03 --minVacPS=2"' type="string"/>
-        <!--<cvalue name='REFERENCE_GENOME' value="${referenceGenome_hg19_chr}" type="string"/>-->
+        
 
     	<!-- Reference genomes -->
-    	<cvalue name='REFERENCE_GENOME' value='${hg38ReferenceDirectory}/GRCh38_d1_vd1_phiX/GRCh38_d1_vd1_phiX.fa' type="path"/>
-    	<cvalue name='INDEX_PREFIX' value='${hg38ReferenceDirectory}/GRCh38_d1_vd1_phiX/bwa-0.7.15/GRCh38_d1_vd1_phiX.fa' type="path"/>
-    	<cvalue name='CHROM_SIZES_FILE' value='${hg38ReferenceDirectory}/GRCh38_d1_vd1_phiX/stats/GRCh38_d1_vd1_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv' type="path" />
-    	<cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38ReferenceDirectory}/GRCh38_d1_vd1_phiX/stats/GRCh38_d1_vd1_phiX.fa.chrLength.tsv' type="path" />
-
-        <!--## Databases -->
-        <!-- hg19DatabasesDirectory=/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases -->
-        <cvalue name="hg38DatabasesDirectory" value="/icgc/ngs_share/assemblies/hg_GRCh38/databases" type="path" />
-        <cvalue name="hg38DatabaseUCSCDirectory" value="/icgc/ngs_share/assemblies/hg_GRCh38/databases/UCSC" type="path" /> 
-
-        <cvalue name="DBSNP" value="${hg38DatabasesDirectory}/dbSNP/dbSNP_151_GRCh38.p7/00-All_v151_GRCh38.SNV.vcf.gz" type="path"/>
+        <cvalue name='REFERENCE_GENOME' value="${referenceGenome_hg19_chr}" type="string"/>
+        
+        <!--## Databases -->        
+        <cvalue name='hg19DatabasesDirectory' value="/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases" type="path" />        
+        <cvalue name="hg19DatabaseUCSCDirectory" value="/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases/UCSC" type="path" />
+        
+        <cvalue name="DBSNP" value="${hg19DatabasesDirectory}/dbSNP/dbSNP_147/00-All.SNV.vcf.gz" type="path"/>
         <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
-        <cvalue name="KGENOME" value="${hg38DatabasesDirectory}/1000Genome/hg38/ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz" type="path"/>
+        <cvalue name="KGENOME" value="${hg19DatabasesDirectory}/1000genomes/ALL.wgs.phase1_integrated_calls.20101123.snps_chr.vcf.gz" type="path"/>
         <cvalue name="KGENOMES_COL" value="1K_GENOMES" type="string"/>
         <cvalue name="GNOMAD_WES_COL" value="GNOMAD_EXOMES" type="string"/>
-        <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/exomes_liftover_grch38/gnomad.exomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
+        <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg19DatabasesDirectory}/gnomAD/gnomad_v2.1/gnomad.exomes.r2.1.sites.SNV.vcf.gz" type="path"/>
         <cvalue name="GNOMAD_WGS_COL" value="GNOMAD_GENOMES" type="string"/>
-        <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v3/genomes/gnomad.genomes.r3.0.sites.SNV.vcf.gz" type="path"/>
-        <cvalue name="LOCALCONTROL" value="${hg38DatabasesDirectory}/LocalControls/ExclusionList_2019_liftover/MPsnvs_PLindels/GRCh38_ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
-        <cvalue name="LOCALCONTROL_COL" value="LocalControlAF" type="string"/>
+        <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg19DatabasesDirectory}/gnomAD/gnomad_v2.1/gnomad.genomes.r2.1.sites.SNV.vcf.gz" type="path"/>
+        <cvalue name="LOCALCONTROL_WGS" value="${hg19DatabasesDirectory}/LocalControls/ExclusionList_2019/MPsnvs_PLindels/ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
+        <cvalue name="LOCALCONTROL_WGS_COL" value="LocalControlAF_WGS" type="string"/>
+        <cvalue name="LOCALCONTROL_WES" value="${hg19DatabasesDirectory}/LocalControls/ExclusionList_2019/MPsnvs_PLindels/ExclusionList_2019_H021_MP_PL_WES.SNVs.AFgt1.vcf.gz" type="path" />
+        <cvalue name="LOCALCONTROL_WES_COL" value="LocalControlAF_WES" type="string"/>
 
-        <cvalue name='ANNOVAR_BINARY' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/annotate_variation.pl' type='path'/>
-        <cvalue name='ANNOVAR_BUILDVER' value='hg38' type='string'/>
-        <cvalue name='ANNOVAR_DBPATH' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/humandb/' type='path'/>
-        <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV31'" type='string'/>
+        <cvalue name='ANNOVAR_BINARY' value='${sharedFilesBaseDirectory}/annovar/annovar_Feb2016/annotate_variation.pl' type='path'/>
+        <cvalue name='ANNOVAR_BUILDVER' value='hg19' type='string'/>
+        <cvalue name='ANNOVAR_DBPATH' value='${sharedFilesBaseDirectory}/annovar/annovar_Feb2016/humandb/' type='path'/>
+        <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV19'" type='string'/>
         <cvalue name="ANNOVAR_GENEANNO_COLS" value='"ANNOVAR_FUNCTION,GENE,EXONIC_CLASSIFICATION,ANNOVAR_TRANSCRIPTS"' type="string"/>
         <cvalue name="ANNOVAR_SEGDUP_COL" value="SEGDUP" type="string"/>
         <cvalue name="ANNOVAR_CYTOBAND_COL" value="CYTOBAND" type="string"/>
@@ -93,7 +91,7 @@
             <cvalue name="CRIT_GNOMAD_GENOMES_maxMAF" value="0.001" type="float"/>
             <cvalue name="CRIT_1KGENOMES_maxMAF" value="0.01" type="string" />
             <cvalue name="CRIT_RECURRENCE" value="7" type="string" />
-            <cvalue name="CRIT_LOCALCONTROL_maxMAF" value="0.01" type="string" />
+            <cvalue name="CRIT_LOCALCONTROL_maxMAF" value="0.05" type="string" />
         <!--#### END EXTRA FILTERING -->
 
         <cvalue name="CONFIDENCE_OPTS" value="-t 500" type="string"
@@ -131,9 +129,13 @@
 
 
         <configurationValueBundle name='PIPE_CONFIG:SNV_RELIABILITY'>
-        	<cvalue name='MAPABILITY' value='${hg38DatabasesDirectory}/mappability/GRCh38_Mappability_Align_100mer.bedGraph.gz' type="path"/>
-        	<cvalue name='SIMPLE_TANDEMREPEATS' value='${hg38DatabaseUCSCDirectory}/simple_tandem_repeats/2019-08-15_SimpleTandemRepeats_Human_hg38.sorted_plain.bed.gz:4' type="path"/>
-        	<cvalue name='REPEAT_MASKER' value='${hg38DatabaseUCSCDirectory}/repeatMasker/2019-08-14_RepeatMasker_Human_hg38.sorted_plain.bed.gz' type="path"/>
+            <cvalue name='MAPABILITY' value='${hg19DatabaseUCSCDirectory}/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz' type="path"/>
+            <cvalue name='HISEQDEPTH' value='${hg19DatabaseUCSCDirectory}/HiSeqDepthTop10Pct_chr.bed.gz' type="path"/>
+            <cvalue name='SIMPLE_TANDEMREPEATS' value='${hg19DatabaseUCSCDirectory}/repeats/SimpleTandemRepeats_chr.bed.gz:4' type="path"/>
+            <cvalue name='REPEAT_MASKER' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_RepeatMasker.bed.gz' type="path"/>
+            <cvalue name='DUKE_EXCLUDED' value='${hg19DatabaseUCSCDirectory}/DukeExcluded_chr.bed.gz' type="path"/>
+            <cvalue name='DAC_BLACKLIST' value='${hg19DatabaseUCSCDirectory}/DACBlacklist_chr.bed.gz' type="path"/>
+            <cvalue name='SELFCHAIN' value='${hg19DatabaseUCSCDirectory}/selfChain_chr.bed.gz:4::--maxNrOfMatches=5' type="path"/>
         </configurationValueBundle>
 
 
@@ -149,13 +151,17 @@
             # see also the usage report of ngs2/trunk/tools/annotate_vcf.pl
             # if any whitespaces are required, use quotation marks like in this example: #MAPABILITY="/ibios/co02/annotation/hg19/wgEncodeCrgMapabilityAlign100mer.bedGraph.gz:::-breakPointMode -aEndOffset=1"-->
         <configurationValueBundle name='PIPE_CONFIG:SNV_DEEPANNOTATION'>
-        	<cvalue name='CpGislands' value='${hg38DatabaseUCSCDirectory}/cpg_islands/2019-08-15_CpG_Islands_Human_hg38.sorted_plain.bed.gz' type="path"/>
-        	<cvalue name='Enhancers' value='${hg38DatabasesDirectory}/enhancers/FANTOM5/F5.hg38.enhancers.sorted_plain.bed.gz' type="path"/>
-        	<cvalue name='miRNAs_sncRNAs' value='${hg38DatabasesDirectory}/miRNA/dashr.v2.sncRNA.annotation.hg38.sncType.sorted_plain.bed.gz' type="path"/>
-        	<cvalue name='miRBase' value='${hg38DatabasesDirectory}/miRNA/miRBase_version-22_hg38.sorted_plain.bed.gz' type="path"/>
-        	<cvalue name='COSMIC' value='${hg38DatabasesDirectory}/COSMIC/cosmic_v81/Cosmic_v81_hg38_all.coding_snvs.bed.gz:7,8,9:1' type="path"/>
-        	<cvalue name='ENCODE_DNASE' value='${hg38DatabaseUCSCDirectory}/DNAase/2019-08-15_Encode_DNAase_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>
-        	<cvalue name='ENCODE_TFBS' value='${hg38DatabaseUCSCDirectory}/TF_clusters/2019-08-15_Encode_TF_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>
+            <cvalue name='CpGislands' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_CpG_islands.bed.gz' type="path"/>
+            <cvalue name='CgiMountains' value='${assembliesHG191000GenomesDirectory}/CustomDeepAnnotation/CgiMountains_chr.bed.gz:4' type="path"/>
+            <cvalue name='Enhancers' value='${hg19DatabaseUCSCDirectory}/Enhancers_Vista_2011_01_14_chr.bed.gz' type="path"/>
+            <cvalue name='miRNAs_snoRNAs' value='${hg19DatabasesDirectory}/miRNA/miRNA_snoRNAs_miRBaseRelease15_chr.bed.gz' type="path"/>
+            <cvalue name='miRBase18' value='${hg19DatabasesDirectory}/miRNA/miRBase_version-18_chr.bed.gz' type="path"/>
+            <cvalue name='miRNAtargets' value='${hg19DatabasesDirectory}/miRNA/miRNAsites_TargetScan_BartelLab_2011_01_14_chr.bed.gz' type="path"/>
+            <cvalue name='phastConsElem20bp' value='${hg19DatabaseUCSCDirectory}/phastConsElem_min20bp_chr.bed.gz:4' type="path"/>
+            <cvalue name='TFBScons' value='${hg19DatabaseUCSCDirectory}/tfbsConsSites_noncoding_merged_chr.bed.gz' type="path"/>
+            <cvalue name='COSMIC' value='${hg19DatabasesDirectory}/COSMIC/Cosmic_v77_hg19_coding_SNVs.bed.gz:7,8,9:1' type="path"/>
+            <cvalue name='ENCODE_DNASE' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_DNase_cluster_V2.bed.gz' type="path"/>
+            <cvalue name='ENCODE_TFBS' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_wgEncodeRegTfbsClusteredV3.bed.gz' type="path"/>
         </configurationValueBundle>
         <!--#>PIPE_CONFIG-->
 

From 98bb76e568da9108ddf6759786ab3f532e19dfa1 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 11 Mar 2021 10:35:23 +0100
Subject: [PATCH 13/49] New xml for GRCh38 files

---
 .../analysisSNVCallingGRCh38.xml              | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 resources/configurationFiles/analysisSNVCallingGRCh38.xml

diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
new file mode 100644
index 0000000..77711f8
--- /dev/null
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -0,0 +1,68 @@
+<configuration name='snvCallingAnalysisGRCh38' description='SNV Calling for NGS data.'
+               configurationType='analysis' class='de.dkfz.roddy.core.Analysis' workflowClass='de.dkfz.b080.co.snvpipeline.SNVCallingWorkflow'
+               runtimeServiceClass='de.dkfz.b080.co.common.BasicCOProjectsRuntimeService'
+               imports="commonCOWorkflowsSettings,snvCallingAnalysis"
+               canCreateTestdata='true'
+               listOfUsedTools="snvCalling,snvJoinVcfFiles,snvAnnotation,snvDeepAnnotation,snvFilter"
+               usedToolFolders="snvPipeline,tools"
+               cleanupScript="cleanupScript">
+
+  <configurationvalues>
+
+    <!--hg38 base directory-->
+    <cvalue name='hg38BaseDirectory' value='/icgc/ngs_share/assemblies/hg_GRCh38' type='path' />    
+    
+    <!-- Reference genomes -->
+    <cvalue name='REFERENCE_GENOME' value="${hg38BaseDirectory}/sequence/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa" type="path"/>
+    <cvalue name='INDEX_PREFIX' value="${hg38BaseDirectory}/indexes/bwa/bwa07_GRCh38_decoy_ebv_phiX_alt_hla_chr/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa" type="path"/>
+    <cvalue name="CHROM_SIZES_FILE" value="${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLenOnlyACGT_realChromosomes.tsv" type="path" />
+    <cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLength.tsv' type="path" />
+
+    <cvalue name='CHR_PREFIX' value='chr' type='string' />
+    <cvalue name="disableAutoBAMHeaderAnalysis" value="true" type="boolean"/>
+    
+    <!--## Databases -->
+    <cvalue name='hg38DatabasesDirectory' value="${hg38BaseDirectory}/databases" type='path' />
+    <cvalue name='hg38DatabaseUCSCDirectory' value="${hg38DatabasesDirectory}/UCSC" type='path' />    
+    
+    <cvalue name="DBSNP" value="${hg38DatabasesDirectory}/dbSNP/dbSNP_151_GRCh38.p7/00-All_v151_GRCh38.SNV.vcf.gz" type="path"/>
+    <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
+    <cvalue name="KGENOME" value="${hg38DatabasesDirectory}/1000Genome/hg38/ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz" type="path"/>
+    <cvalue name="KGENOMES_COL" value="1K_GENOMES" type="string"/>
+    <cvalue name="GNOMAD_WES_COL" value="GNOMAD_EXOMES" type="string"/>
+    <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/exomes_liftover_grch38/gnomad.exomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
+    <cvalue name="GNOMAD_WGS_COL" value="GNOMAD_GENOMES" type="string"/>
+    <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v3/genomes/gnomad.genomes.r3.0.sites.SNV.vcf.gz" type="path"/>
+    <cvalue name="LOCALCONTROL_WGS" value="${hg38DatabasesDirectory}/LocalControls/ExclusionList_2019_liftover/MPsnvs_PLindels/GRCh38_ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
+    <cvalue name="LOCALCONTROL_WGS_COL" value="LocalControlAF_WGS" type="string"/>
+    <cvalue name="LOCALCONTROL_WES" value="${hg38DatabasesDirectory}/LocalControls/ExclusionList_2019_liftover/MPsnvs_PLindels/GRCh38_ExclusionList_2019_H021_MP_PL_WES.SNVs.AFgt1.vcf.gz" type="path" />
+    <cvalue name="LOCALCONTROL_WES_COL" value="LocalControlAF_WES" type="string"/>
+
+    <cvalue name='ANNOVAR_BINARY' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/annotate_variation.pl' type='path'/>
+    <cvalue name='ANNOVAR_BUILDVER' value='hg38' type='string'/>
+    <cvalue name='ANNOVAR_DBPATH' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/humandb/' type='path'/>
+    <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV31'" type='string'/>
+    
+    <!--Reference annotations during confidene annotation-->
+    <cvalue name="CONFIDENCE_OPTS" value="-t 500 --refgenome GRCh38 ftp://ftp.sanger.ac.uk/pub/cancer/dockstore/human/GRCh38_hla_decoy_ebv/core_ref_GRCh38_hla_decoy_ebv.tar.gz" type="string"
+            description="max read depth in control (-t INT default 150 for genomes, set to 500 for exomes) can now be ommited to avoid depth dependent punishing by setting -c 0. It is also possible to get a justification for the confidence score by setting the -p 1 flag"/>
+    
+    <!--SNV reliability bundle-->
+    <configurationValueBundle name='PIPE_CONFIG:SNV_RELIABILITY'>
+     	<cvalue name='MAPABILITY' value='${hg38DatabasesDirectory}/mappability/GRCh38_Mappability_Align_100mer.bedGraph.gz' type="path"/>
+     	<cvalue name='SIMPLE_TANDEMREPEATS' value='${hg38DatabaseUCSCDirectory}/simple_tandem_repeats/2019-08-15_SimpleTandemRepeats_Human_hg38.sorted_plain.bed.gz:4' type="path"/>
+     	<cvalue name='REPEAT_MASKER' value='${hg38DatabaseUCSCDirectory}/repeatMasker/2019-08-14_RepeatMasker_Human_hg38.sorted_plain.bed.gz' type="path"/>
+    </configurationValueBundle>
+
+    <!--SNV deep annotation bundle-->
+    <configurationValueBundle name='PIPE_CONFIG:SNV_DEEPANNOTATION'>
+      <cvalue name='CpGislands' value='${hg38DatabaseUCSCDirectory}/cpg_islands/2019-08-15_CpG_Islands_Human_hg38.sorted_plain.bed.gz' type="path"/>
+      <cvalue name='Enhancers' value='${hg38DatabasesDirectory}/enhancers/FANTOM5/F5.hg38.enhancers.sorted_plain.bed.gz' type="path"/>
+      <cvalue name='miRNAs_sncRNAs' value='${hg38DatabasesDirectory}/miRNA/dashr.v2.sncRNA.annotation.hg38.sncType.sorted_plain.bed.gz' type="path"/>
+      <cvalue name='miRBase' value='${hg38DatabasesDirectory}/miRNA/miRBase_version-22_hg38.sorted_plain.bed.gz' type="path"/>
+      <cvalue name='COSMIC' value='${hg38DatabasesDirectory}/COSMIC/cosmic_v81/Cosmic_v81_hg38_all.coding_snvs.bed.gz:7,8,9:1' type="path"/>
+      <cvalue name='ENCODE_DNASE' value='${hg38DatabaseUCSCDirectory}/DNAase/2019-08-15_Encode_DNAase_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>
+      <cvalue name='ENCODE_TFBS' value='${hg38DatabaseUCSCDirectory}/TF_clusters/2019-08-15_Encode_TF_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>
+    </configurationValueBundle>
+  </configurationvalues>
+</configuration>
\ No newline at end of file

From 90b99785600907f1e809bc6661d487cf8b3bf66f Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Thu, 11 Mar 2021 10:36:18 +0100
Subject: [PATCH 14/49] chrLength file with 'chr' prefix

---
 .../snvPipeline/intermutationDistance_Coord_color.r      | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
index e00938f..67ec8b3 100755
--- a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
+++ b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
@@ -45,6 +45,11 @@ c.array <- unlist(strsplit(opt$chrArray, ',', fixed = TRUE))
 data <- read.table(opt$chrLengthFile, header=FALSE)
 chrLength <- data.frame(data)
 rownames(chrLength) <- chrLength$V1
+
+if((opt$chrPrefix != "" && grepl(opt$chrPrefix, chrLength$V1[1])) && !grepl(opt$chrPrefix, c.array[1])){
+	c.array <- paste0(opt$chrPrefix, c.array, opt$chrSuffix)
+}
+
 xtotal <- sum(chrLength[c.array,2]/10)
 xoffsets <- c(0)
 
@@ -84,7 +89,7 @@ pdf(opt$outFilePrefix, width=20, height=8)
 plotColors = c("CA"="blue","CG"="black","CT"="red","TA"="purple","TC"="orange","TG"="green")
 baseChanges <- c("CA", "CG", "CT", "TA", "TC","TG")
 
-c.name <- paste0(opt$chrPrefix, c.array[1], opt$chrSuffix)  
+c.name <- c.array[1]
 
 sel <- which(dat$diff > 0)
 if(length(sel)){
@@ -101,7 +106,7 @@ if(length(sel)){
 	# chromosomes <- c(2:22, 'X')
 
 	for (chr in c.array) {
-	  c.name <- paste0(opt$chrPrefix, chr, opt$chrSuffix)  
+	  c.name <- chr
 
 	  sel <- which(dat$X.CHROM == c.name & dat$diff > 0)
 	  points((dat$POS[sel]/10+xoffset)/xtotal, log10(dat$diff[sel]), col = plotColors[dat$change[sel]], pch=16,cex=0.8)

From 58b77cd1dd925d1ad8fe65c4e74de9c0d0300f98 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Tue, 16 Mar 2021 09:55:28 +0100
Subject: [PATCH 15/49] Removing hard-coded header parsing

---
 .../snvPipeline/in_dbSNPcounter.pl            | 53 +++++--------------
 .../snvPipeline/snv_extractor_v1.pl           | 15 +++---
 2 files changed, 21 insertions(+), 47 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
index cbf0ef3..737ae04 100755
--- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
+++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
@@ -43,49 +43,24 @@
 }
 chomp $header;
 
-my $DBSBP = 13;
-my $KG = 14;
-my $CONFIDENCE = 28;
-my $CANNOTATION = -1;
-my $RECLASSIFICATION = -1;
-my $CLASSIFICATION;
+my @head=split("\t", $header);
+my %col;
 
-@help = split(/\t/, $header);
-for (my $c = 0; $c < @help; $c++)
+my $i = 0;
+while($i < @head)
 {
-	# fixed column labels (from vcf_pileup_compare)
-	if ($help[$c] eq "CONFIDENCE")
-	{
-		$CONFIDENCE = $c;
-		print STDERR "CONFIDENCE in column $c\n";
-	}
-	if ($help[$c] eq "1K_GENOMES")
-	{
-		$KG = $c;
-		print STDERR "INFO in column $c\n";
-	}
-	if ($help[$c] eq "DBSNP")
-	{
-		$DBSBP = $c;
-		print STDERR "DBSNP_COL in column $c\n";
-	}
-	if ($help[$c] eq "ANNOTATION_control")
-    {
-    	$CANNOTATION = $c;
-    	print STDERR "ANNOTATION_control in column $c\n";
-    }
-    if ($help[$c] eq "RECLASSIFICATION")
-    {
-        $RECLASSIFICATION = $c;
-        print STDERR "RECLASSIFICATION in column $c\n";
-    }
+	if($head[$i] eq "DBSNP"){$col{"DBSNP"} = $i; print STDERR "DBSNP in column ", $i+1,"\n";}
+	if($head[$i] eq "1K_GENOMES"){$col{"1K_GENOMES"} = $i; print STDERR "1K_GENOMES in column ", $i+1,"\n";}
+	if($head[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print STDERR "RECLASSIFICATION in column ", $i+1,"\n";}
+	if($head[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print STDERR "ANNOTATION_control in column ", $i+1,"\n";}
+	if($head[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print STDERR "CONFIDENCE in column ", $i+1,"\n";}
+	$i++;
 }
 
-if ($CANNOTATION > 0) {
-    $CLASSIFICATION = $CANNOTATION;
-} else {
-    $CLASSIFICATION = $RECLASSIFICATION;
-}
+my $DBSBP = $col{"DBSNP"};
+my $KG = $col{"1K_GENOMES"};
+my $CONFIDENCE = $col{"CONFIDENCE"};
+my $CLASSIFICATION = $col{"RECLASSIFICATION"};
 
 # I know that it's evilly hardcoded!
 while (<FH>)
diff --git a/resources/analysisTools/snvPipeline/snv_extractor_v1.pl b/resources/analysisTools/snvPipeline/snv_extractor_v1.pl
index cea35fa..d880b2e 100755
--- a/resources/analysisTools/snvPipeline/snv_extractor_v1.pl
+++ b/resources/analysisTools/snvPipeline/snv_extractor_v1.pl
@@ -64,18 +64,17 @@
 print STDERR "NOTE: The reading of the VCF will end with a broken pipe error, because we only read the header and then stop.\n";
 close IN;
 
-my @head=split("\t", $head);
+my @header=split("\t", $head);
 my %col;
 
-
 my $i = 0;
-while($i < @head)
+while($i < @header)
 {
-	if($head[$i] eq "EXONIC_CLASSIFICATION"){$col{"EXONIC_CLASSIFICATION"} = $i; print "EXONIC_CLASSIFICATION in column ", $i+1,"\n";}
-	if($head[$i] eq "ANNOVAR_FUNCTION"){$col{"ANNOVAR_FUNCTION"} = $i; print "ANNOVAR_FUNCTION in column ", $i+1,"\n";}
-	if($head[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print "RECLASSIFICATION in column ", $i+1,"\n";}
-	if($head[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print "ANNOTATION_control in column ", $i+1,"\n";}
-	if($head[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print "CONFIDENCE in column ", $i+1,"\n";}
+	if($header[$i] eq "EXONIC_CLASSIFICATION"){$col{"EXONIC_CLASSIFICATION"} = $i; print "EXONIC_CLASSIFICATION in column ", $i+1,"\n";}
+	if($header[$i] eq "ANNOVAR_FUNCTION"){$col{"ANNOVAR_FUNCTION"} = $i; print "ANNOVAR_FUNCTION in column ", $i+1,"\n";}
+	if($header[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print "RECLASSIFICATION in column ", $i+1,"\n";}
+	if($header[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print "ANNOTATION_control in column ", $i+1,"\n";}
+	if($header[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print "CONFIDENCE in column ", $i+1,"\n";}
 	$i++;
 }
 

From 1af4b686b575e8c2230ed09bebadec6355c92e77 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Tue, 16 Mar 2021 10:01:09 +0100
Subject: [PATCH 16/49] Liftover local control for hg38 WES and WGS

---
 .../snvPipeline/confidenceAnnotation_SNVs.py  | 21 ++++++++++++-------
 .../analysisTools/snvPipeline/filter_vcf.sh   |  3 ++-
 .../snvPipeline/snvAnnotation.sh              |  7 ++++---
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index 6c8d22c..88e09f3 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -107,7 +107,8 @@ def main(args):
             if args.no_control:
                 variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$"
                 variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$"
-                variable_headers["LOCALCONTROL_COL"] = "^LocalControlAF$"
+                variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$"
+                variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$"                
             else:
                 fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ]
 
@@ -187,7 +188,8 @@ def main(args):
             classification = "somatic" # start with default somatic
             inGnomAD_WES = False
             inGnomAD_WGS = False
-            inLocalControl = False
+            inLocalControl_WES = False
+            inLocalControl_WGS = False
         else:
             # for potential re-classification (e.g. low coverage in control and in dbSNP => probably germline)
             classification = help["ANNOTATION_control"] # start with original classification
@@ -239,10 +241,15 @@ def main(args):
                 infofield["gnomAD_Genomes"] = "gnomAD_Genomes"
                 reasons += "gnomAD_Genomes(NoControl)"
 
-            if help["LOCALCONTROL_COL_VALID"] and any(af > 0.02 for af in map(float, extract_info(help["LOCALCONTROL_COL"], "AF").split(','))):
-                inLocalControl = True
-                infofield["LocalControl"] = "LocalControl"
-                reasons += "LocalControl(NoControl)"
+            if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))):
+                inLocalControl_WGS = True
+                infofield["LocalControl_WGS"] = "LocalControl_WGS"
+                reasons += "LocalControl_WGS(NoControl)"
+            if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))):
+                inLocalControl_WES = True
+                infofield["LocalControl_WES"] = "LocalControl_WES"
+                reasons += "LocalControl_WES(NoControl)"
+
 
         # Punish for biases round 1
         if idx_pcrbias != -1 and idx_seqbias != -1 and args.round == 1:
@@ -391,7 +398,7 @@ def main(args):
 
         if (args.no_control):
             # an SNV that is in dbSNP but not "clinic" or/and in 1 KG with high frequency is probably germline
-            if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl):
+            if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS or inLocalControl_WES):
                classification = "SNP_support_germline"
 
         # 3) information from the calls and germline comparisons: coverage, strand bias, variant support, ..
diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh
index dc71e1e..0e8f226 100755
--- a/resources/analysisTools/snvPipeline/filter_vcf.sh
+++ b/resources/analysisTools/snvPipeline/filter_vcf.sh
@@ -40,7 +40,8 @@ if [[ ${isNoControlWorkflow-false} == "true" ]]; then
     #[[ ${FILTER_1KGENOMES} == 'true' ]]     && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AFR_AF ${CRIT_1KGENOMES_maxMAF}+"
     [[ ${FILTER_1KGENOMES} == 'true' ]]     && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+"
     [[ ${FILTER_NON_CLINIC} == 'true' ]]    && FILTER_VALUES="${FILTER_VALUES} ${DBSNP_COL} CLN,COMMON nonexist,exist"
-    [[ ${FILTER_LOCALCONTROL} == 'true' ]]  && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+"
+    [[ ${FILTER_LOCALCONTROL} == 'true' ]]  && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WGS_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+"
+    [[ ${FILTER_LOCALCONTROL} == 'true' ]]  && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WES_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+"
     [[ ${FILTER_RECURRENCE} == 'true' ]]    && FILTER_VALUES="${FILTER_VALUES} ${RECURRENCE_COL} . ${CRIT_RECURRENCE}+"
 
     if [[ ${FILTER_VALUES} != "" ]]; then
diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh
index 4400766..575e50e 100755
--- a/resources/analysisTools/snvPipeline/snvAnnotation.sh
+++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh
@@ -91,9 +91,10 @@ fi
 
 cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${DBSNP} --columnName=${DBSNP_COL} --reportMatchType  --bAdditionalColumn=2 --reportLevel 4 | \
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL}  --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL} --columnName ${LOCALCONTROL_COL} --bFileType vcf --reportMatchType --reportLevel 4 "
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL_WGS} --columnName ${LOCALCONTROL_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL_WES} --columnName ${LOCALCONTROL_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 "
 
 if [[ -f ${RECURRENCE} ]]; then
     cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${RECURRENCE} --columnName ${RECURRENCE_COL} --bFileType vcf"

From baabd823301d9ed32f545b2e705ba8f22e86d1f7 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Tue, 16 Mar 2021 10:03:25 +0100
Subject: [PATCH 17/49] hg19 specific annotations

---
 .../snvPipeline/confidenceAnnotation_SNVs.py  | 53 +++++++++++--------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index 88e09f3..4ffe53d 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -32,7 +32,15 @@ def extract_info(info, keys, sep=";"):
         rtn = '0' if rtn == "None" else rtn
         return rtn
 
+def validate_refgenome(refname):
+    valid_refgenome = ('hs37d5', 'GRCh38')
+    if refname not in valid_refgenome:
+        raise ValueError('Reference name (--refgenome) is not valid: %s. Valid reference genome names are %s' % (refname, ', '.join(valid_refgenome)))
+
 def main(args):
+
+    validate_refgenome(args.refgenome[0])
+
     if args.pancanout is not None and not args.no_makehead:
         header = '##fileformat=VCFv4.1\n' \
                  '##fileDate=' + time.strftime("%Y%m%d") + '\n' \
@@ -102,6 +110,13 @@ def main(args):
                              "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
                              "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
                              "^seqBiasPresent_2$", "^seqingBiasPresent_2$"]
+            
+            if args.refgenome[0] == "hs37d5":
+                fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED",
+                                 "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
+                                 "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
+                                 "^seqBiasPresent_2$", "^seqingBiasPresent_2$"]
+
             variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", }
 
             if args.no_control:
@@ -327,27 +342,23 @@ def main(args):
             reasons += "Segmental_dup(-2)"
             filterfield["RE"] = 1
 
-        # Encode blacklist, generated for hg38 mappings
-#        if help["ENCODE_BLACKLIST_VALID"]:
-#            confidence -= 3 # High signal region and low mappability
-#            is_weird = True
-#            reasons += "Blacklist(-3)"
-#            filterfield["BL"] = 1
-
-        # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat
-        #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]:
-        #    confidence -= 3 # really bad region, usually centromeric repeats
-        #    is_weird = True
-        #    reasons += "Blacklist(-3)"
-        #    filterfield["BL"] = 1
-
-        # HiSeqDepth: regions "attracting" reads; often coincide with tandem repeats and CEN/TEL,
-        # not always with low mapability
-        #if help["HISEQDEPTH_VALID"]:
-        #    confidence -= 3 # really really bad region!
-        #    is_weird = True
-        #    reasons += "Hiseqdepth(-3)"
-        #    filterfield["HSDEPTH"] = 1
+        # Only for hg19 reference genome
+        if args.refgenome[0] == "hs37d5":
+
+            # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat
+            if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]:
+                confidence -= 3 # really bad region, usually centromeric repeats
+                is_weird = True
+                reasons += "Blacklist(-3)"
+                filterfield["BL"] = 1
+
+            # HiSeqDepth: regions "attracting" reads; often coincide with tandem repeats and CEN/TEL,
+            # not always with low mapability
+            if help["HISEQDEPTH_VALID"]:
+                confidence -= 3 # really really bad region!
+                is_weird = True
+                reasons += "Hiseqdepth(-3)"
+                filterfield["HSDEPTH"] = 1
 
         # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ...
         # Everything with really high number of occurences is artefacts

From 5a764ef4c765aafddeb2f2d28490d50a4ab4a9e1 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Tue, 16 Mar 2021 10:06:35 +0100
Subject: [PATCH 18/49] User defined threshold-based 'SNP_support_germline'
 annotations

---
 .../snvPipeline/confidenceAnnotation_SNVs.py    | 11 ++++++++---
 .../analysisTools/snvPipeline/snvAnnotation.sh  | 17 +++++++++++------
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index 4ffe53d..dd2c281 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -225,7 +225,7 @@ def main(args):
             in1KG = True
             if args.no_control:
                 af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF")
-                if af is not None and any(af > 0.01 for af in map(float, af.split(','))) > 0.01:
+                if af is not None and any(af > args.kgenome_maxMAF for af in map(float, af.split(','))):
                     in1KG_AF = True
             infofield["1000G"] = "1000G"
         # dbSNP
@@ -247,11 +247,11 @@ def main(args):
         if args.no_control:
             if indbSNP and is_commonSNP and not is_clinic:
                 reasons += "dbSNP(NoControl)"
-            if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))):
+            if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))):
                 inGnomAD_WES = True
                 infofield["gnomAD_Exomes"] = "gnomAD_Exomes"
                 reasons += "gnomAD_Exomes(NoControl)"
-            if help["GNOMAD_GENOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))):
+            if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))):
                 inGnomAD_WGS = True
                 infofield["gnomAD_Genomes"] = "gnomAD_Genomes"
                 reasons += "gnomAD_Genomes(NoControl)"
@@ -800,5 +800,10 @@ def main(args):
     parser.add_argument("-x", "--runexome", dest="runexome", action="store_true", default=False,
                         help="Run on exome, will turn off the high control coverage punishment " \
                              "and the PCR bias filter.")
+    parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", help="Max gnomAD WGS MAF", default=0.001, type=float)
+    parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", help="Max gnomAD WES MAF", default=0.001, type=float)
+    parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", help="Max local control WGS MAF", default=0.01, type=float)
+    parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", help="Max local control WES MAF", default=0.01, type=float)
+    parser.add_argument("--1000genome_maxMAF", dest="kgenome_maxMAF", help="Max 1000 genome MAF", default=0.01, type=float)
     args = parser.parse_args()
     main(args)
diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh
index 575e50e..713a5bc 100755
--- a/resources/analysisTools/snvPipeline/snvAnnotation.sh
+++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh
@@ -197,7 +197,8 @@ mkfifo ${npConfidence}
 if [[ ${isNoControlWorkflow-false} == "false" ]]; then
     cat ${filenameSNVVCF} > ${npConfidence} &
 else
-    cat ${filenameSNVVCF} | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 > ${npConfidence} &
+    cat ${filenameSNVVCF} | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 \
+        --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${npConfidence} &
 fi
 
 # create BaseScore FIFOs and their consumer processes (zip and write to target file)
@@ -212,14 +213,15 @@ cat ${filenameReferenceAlleleReadPositions}_NP | ${BGZIP_BINARY} -f >${filenameR
 if [[ ${runArtifactFilter-true} == true ]]
 then
 	cat ${npConfidence} | filterPeOverlap ${noControlFlag} --alignmentFile=${tumorbamfullpath} --mapq=$mapqual --baseq=$basequal --qualityScore=phred --maxNumberOfMismatchesInRead=${NUMBER_OF_MISMACTHES_THRESHOLD--1} --altBaseQualFile=${filenameAlternativeAlleleBaseScores}_NP --refBaseQualFile=${filenameReferenceAlleleBaseScores}_NP --altBasePositionsFile=${filenameAlternativeAlleleReadPositions}_NP --refBasePositionsFile ${filenameReferenceAlleleReadPositions}_NP --referenceFile=${REFERENCE_GENOME} \
-						| ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 -f ${filenameSomaticSNVsTmp} > ${filenameSNVVCFTemp}.tmp
+						| ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 -f ${filenameSomaticSNVsTmp} \
+                            --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp}.tmp
 
     [[ $? != 0 ]] && echo "Error in first iteration of confidence annotation" && exit 2
 
 	NRSOMSNV=`grep -v "^#" ${filenameSomaticSNVsTmp} | wc -l`
 	echo -e "SOMATIC_SNVS_UNFILTERED\t${NRSOMSNV}">> ${filenameQCValues}
 
-	mv ${filenameSNVVCFTemp}.tmp ${filenameSNVVCFTemp}
+	mv ${filenameSNVVCFTemp}.tmp ${filenameSNVVCFTemp}    
 
     wait ${zipAlternativeAlleleBaseScores} ; [[ $? -gt 0 ]] && echo "Error from zipAlternativeAlleleBaseScores" && exit 31
     wait ${zipReferenceAlleleBaseScores} ; [[ $? -gt 0 ]] && echo "Error from zipReferenceAlleleBaseScores" && exit 32
@@ -241,7 +243,8 @@ then
     fi
 
 	cat ${filenameSNVVCFTemp} | ${PYTHON_BINARY} ${TOOL_FLAG_BIAS} --vcfFile="/dev/stdin" --referenceFile=${REFERENCE_GENOME} --sequence_specificFile=${filenamePCRerrorMatrix} --sequencing_specificFile=${filenameSequencingErrorMatrix} --numReads=${nReads} --numMuts=${nMuts} --biasPValThreshold=${biasPValThreshold} --biasRatioThreshold=${biasRatioThreshold} --biasRatioMinimum=${biasRatioMinimum} --maxNumOppositeReadsSequencingWeakBias=${maxNumOppositeReadsSequencingWeakBias} --maxNumOppositeReadsSequenceWeakBias=${maxNumOppositeReadsSequenceWeakBias} --maxNumOppositeReadsSequencingStrongBias=${maxNumOppositeReadsSequencingStrongBias} --maxNumOppositeReadsSequenceStrongBias=${maxNumOppositeReadsSequenceStrongBias} --ratioVcf=${rVcf} --bias_matrixSeqFile=${filenameBiasMatrixSeqFile} --bias_matrixSeqingFile=${filenameBiasMatrixSeqingFile} --vcfFileFlagged="/dev/stdout" | \
-	${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 1 -f ${filenameSomaticSNVsTmp} > ${filenameSNVVCFTemp}.tmp
+	${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 1 -f ${filenameSomaticSNVsTmp} \
+        --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp}.tmp
 
 	[[ $? != 0 ]] && echo "Error in first filtering and/or second interation of confidence annotation" && exit 5
 
@@ -265,7 +268,8 @@ then
     fi
 
 	cat ${filenameSNVVCFTemp} | ${PYTHON_BINARY} -u ${TOOL_FLAG_BIAS} --vcfFile="/dev/stdin" --referenceFile=${REFERENCE_GENOME} --sequence_specificFile=${filenamePCRerrorMatrix} --sequencing_specificFile=${filenameSequencingErrorMatrix} --numReads=${nReads} --numMuts=${nMuts} --biasPValThreshold=${biasPValThreshold} --biasRatioThreshold=${biasRatioThreshold} --biasRatioMinimum=${biasRatioMinimum} --maxNumOppositeReadsSequencingWeakBias=${maxNumOppositeReadsSequencingWeakBias} --maxNumOppositeReadsSequenceWeakBias=${maxNumOppositeReadsSequenceWeakBias} --maxNumOppositeReadsSequencingStrongBias=${maxNumOppositeReadsSequencingStrongBias} --maxNumOppositeReadsSequenceStrongBias=${maxNumOppositeReadsSequenceStrongBias} --ratioVcf=${rVcf} --bias_matrixSeqFile=${filenameBiasMatrixSeqFile} --bias_matrixSeqingFile=${filenameBiasMatrixSeqingFile} --vcfFileFlagged="/dev/stdout" | \
-	${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} -a 2  > ${filenameSNVVCF}
+	${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} -a 2 \
+    --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCF}
 
 	[[ $? != 0 ]] && echo "Error in second filtering and/or third iteration of confidence annotation" && exit 8
 
@@ -287,7 +291,8 @@ then
 	fi
 else
 	cat ${npConfidence} | filterPeOverlap ${noControlFlag} --alignmentFile=${tumorbamfullpath} --mapq=$mapqual --baseq=$basequal --qualityScore=phred --maxNumberOfMismatchesInRead=${NUMBER_OF_MISMACTHES_THRESHOLD--1} --altBaseQualFile=${filenameAlternativeAlleleBaseScores}_NP --refBaseQualFile=${filenameReferenceAlleleBaseScores}_NP --altBasePositionsFile=${filenameAlternativeAlleleReadPositions}_NP --refBasePositionsFile=${filenameReferenceAlleleReadPositions}_NP --referenceFile=${REFERENCE_GENOME} | \
-	${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} > ${filenameSNVVCFTemp}
+	${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} \
+    --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp}
 
     exitCode=$?
     [[ $exitCode == 0 ]] && [[ -f ${filenameSNVVCFTemp} ]] && mv ${filenameSNVVCFTemp} ${filenameSNVVCF}

From da889407fd8d967d54549759670575a9ac7548af Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Tue, 16 Mar 2021 10:36:17 +0100
Subject: [PATCH 19/49] Removed extra spaces

---
 buildversion.txt                              |  2 +-
 .../snvPipeline/confidenceAnnotation_SNVs.py  |  4 +-
 .../snvPipeline/filter_PEoverlap.py           | 72 +++++++++----------
 .../snvPipeline/snvAnnotation.sh              |  2 +-
 .../configurationFiles/analysisSNVCalling.xml | 10 +--
 .../analysisSNVCallingGRCh38.xml              | 14 ++--
 6 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/buildversion.txt b/buildversion.txt
index e94ba2a..228b97b 100755
--- a/buildversion.txt
+++ b/buildversion.txt
@@ -1,2 +1,2 @@
-2.0
+2.1
 0
diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index dd2c281..22be339 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -110,7 +110,7 @@ def main(args):
                              "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
                              "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
                              "^seqBiasPresent_2$", "^seqingBiasPresent_2$"]
-            
+
             if args.refgenome[0] == "hs37d5":
                 fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED",
                                  "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
@@ -123,7 +123,7 @@ def main(args):
                 variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$"
                 variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$"
                 variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$"
-                variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$"                
+                variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$"
             else:
                 fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ]
 
diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
index f0cabc3..9b7fd57 100755
--- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py
+++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
@@ -134,11 +134,11 @@ def performAnalysis(args):
     mode = "r"
     multiple_iterators = False
     if args.alignmentFile.split(".")[-1] == "bam":
-        mode += "b"        
+        mode += "b"
         samfile = pysam.Samfile(args.alignmentFile, mode)
     elif args.alignmentFile.split(".")[-1] == "cram":
-        mode += "c"        
-        samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file)        
+        mode += "c"
+        samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file)
 
     if args.altPosF != '':
         ALT_basePositions_file = args.altPosF
@@ -177,7 +177,7 @@ def performAnalysis(args):
         REF_baseQualities=[]
         ALT_baseQualities=[]
 
-        # how to treat multiallelic SNVs? Skipped in this current version...                
+        # how to treat multiallelic SNVs? Skipped in this current version...
         if ((args.no_control and int(parsed_line["CONFIDENCE"]) > 7 and "somatic" in parsed_line["RECLASSIFICATION"]) or (not args.no_control and "somatic" in parsed_line["ANNOTATION_control"])) and len(parsed_line["ALT"]) == 1:
             # DP=13;AF1=0.5;AC1=1;DP4=2,3,3,4;MQ=37;FQ=75;PV4=1,1,1,1
             info_values = parsed_line["INFO"].split(';')
@@ -187,14 +187,14 @@ def performAnalysis(args):
                     DP4 = map(int, info_value[4:].split(','))
                     DP4rf, DP4rr, DP4af, DP4ar = DP4
                     DP4_original = re.sub('DP4', 'DP4original', info_value) # Keeping a backup of original DP4
-                    DP4_original_alt = DP4af + DP4ar                    
+                    DP4_original_alt = DP4af + DP4ar
                     break
 
             chrom=parsed_line["CHROM"]
             pos=int(parsed_line["POS"])
             REF=parsed_line["REF"]
             ALT=parsed_line["ALT"]
-            
+
             readNameHash={}
             readMateHash={} # Hash to store read and mate starting positions for duplicate marking
             readMateHash_qnameLocation={} # Hash to store the location of gname in the above hash list
@@ -202,22 +202,22 @@ def performAnalysis(args):
 
             ACGTNacgtn1 = [0]*10
             ACGTNacgtn2 = [0]*10
-            count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection  
+            count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection
             count_supple = BoolCounter(REF, ALT) # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions
             count_mismatch = BoolCounter(REF, ALT) # " for mismatch report
             count_nonREFnonALT = BoolCounter(REF, ALT) # " for nonREF and nonALT bases
 
             # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq).
             for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators):
-                if pileupcolumn.pos == (pos-1):                	
-                    #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments)                    
-                    for pileupread in pileupcolumn.pileups:                    	
-                        if pileupread.is_del:                            
-                            # 31 May 2016 JB: deletion at the pileup position                            
-                            continue                        
+                if pileupcolumn.pos == (pos-1):
+                    #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments)
+                    for pileupread in pileupcolumn.pileups:
+                        if pileupread.is_del:
+                            # 31 May 2016 JB: deletion at the pileup position
+                            continue
                         baseScore = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0]
                         readpos = pileupread.query_position
-                        if pileupread.alignment.seq[pileupread.query_position].lower()  == ALT.lower():                            
+                        if pileupread.alignment.seq[pileupread.query_position].lower()  == ALT.lower():
                             if args.altBQF != '':
                                 ALT_baseQualities.append(baseScore)
                             if args.altPosF != '':
@@ -237,7 +237,7 @@ def performAnalysis(args):
                         if pileupread.alignment.mapq >= args.mapq:
                             # http://wwwfgu.anat.ox.ac.uk/~andreas/documentation/samtools/api.html   USE qqual
                             try:
-                                if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:                                    
+                                if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:
                                     # check if we consider this read as a proper read in terms of number of mismatches
                                     if args.allowedNumberOfMismatches > -1:
                                         numberOfMismatches = None
@@ -246,14 +246,14 @@ def performAnalysis(args):
                                                 numberOfMismatches = tag[1]
                                                 break 
                                             else:
-                                                continue                                    
-                                        
+                                                continue
+
                                         if numberOfMismatches is not None:
                                             if numberOfMismatches > args.allowedNumberOfMismatches:
                                                 remove_base = pileupread.alignment.seq[pileupread.query_position]
                                                 remove_is_reverse = pileupread.alignment.is_reverse
                                                 count_mismatch.update(remove_base)
-                                                (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)                                         
+                                                (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
                                                 # after decreasing the respective DP4 value, go directly to the next read
                                                 # without remembering the current read
                                                 # This will lead to an unknown read name when the paired read occurs at the same
@@ -261,10 +261,10 @@ def performAnalysis(args):
                                                 # have to decrease DP4 values again, when the read partner occurs at the same SNV.
                                                 # We also do not increase ANCGTNacgtn for the discarded read.
                                                 continue 
-                                                                            
+
                                     # Check if pileupread.alignment is proper pair
                                     if(pileupread.alignment.is_proper_pair):
-                                        # count to ACGTNacgtn list                                        
+                                        # count to ACGTNacgtn list
                                         is_reverse = pileupread.alignment.is_reverse
                                         is_read1 = pileupread.alignment.is_read1
                                         base = pileupread.alignment.seq[pileupread.query_position].lower()
@@ -332,17 +332,17 @@ def performAnalysis(args):
                                             	readMateHash[read_mate_tuple] = []
                                             	readMateHash[read_mate_tuple].append(read_mate_tuple_value)
 
-                                            readMateHash_qnameLocation[pileupread.alignment.query_name] = len(readMateHash[read_mate_tuple]) - 1 # Location of the last pushed element in the array                                            
+                                            readMateHash_qnameLocation[pileupread.alignment.query_name] = len(readMateHash[read_mate_tuple]) - 1 # Location of the last pushed element in the array
 
                             except IndexError:
                                 "soft-clipped or trimmed base, not part of the high-qual alignemnt anyways, skip"
 
                             if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:
-                            
+
                                 if pileupread.alignment.seq[pileupread.query_position] == ALT:
-                                    ALTcount += 1                                
+                                    ALTcount += 1
                                 # samtools mpileup sometimes counts bases as variants which are neither REF nor ALT
-                                if (pileupread.alignment.seq[pileupread.query_position] != REF) and (pileupread.alignment.seq[pileupread.query_position] != ALT):                                    
+                                if (pileupread.alignment.seq[pileupread.query_position] != REF) and (pileupread.alignment.seq[pileupread.query_position] != ALT):
                                     if pileupread.alignment.is_reverse:                                    	
                                         nonREFnonALTrev += 1
                                         #if DP4ar > 0: DP4ar -= 1
@@ -384,11 +384,11 @@ def performAnalysis(args):
             # Calculating duplicates based on read-mate pair's start positions (chr id and start location)
             count_duplicate = BoolCounter(REF, ALT)
             for key in readMateHash:
-            	value_length = len(readMateHash[key])            	            	
-            	if value_length > 0:            		
+            	value_length = len(readMateHash[key])
+            	if value_length > 0:
             	    sorted_values = sorted(readMateHash[key], key=lambda x: x[0]) # Sorted based on base quality
                     sorted_values = sorted_values[:-1] # removing the read with highest quality, so it will be retained for count
-                    for value in sorted_values:        # Removing everthing else                            
+                    for value in sorted_values:        # Removing everthing else
                         qual_value, decreaseInfo = value
                         remove_base = decreaseInfo[0]
                         remove_is_reverse = decreaseInfo[1]
@@ -396,32 +396,32 @@ def performAnalysis(args):
                         count_duplicate.update(remove_base)
                         (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
 
-            if (DP4[2] + DP4[3]) > ALTcount:    # that the ALTcount is larger  happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling                        
-                if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd                
+            if (DP4[2] + DP4[3]) > ALTcount:    # that the ALTcount is larger  happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling
+                if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd
 
                 if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev
-                
+
                 nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd
                 count_nonREFnonALT.update_nonREFnonALT(nonREFnonALT)
-            
+
             #Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)"
             supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted()))
             supple_dup_str += ';DP2dup=' + ','.join(map(str, count_duplicate.counted()))
             supple_dup_str += ';DP2pairEnd=' + ','.join(map(str, count_PE.counted()))
             supple_dup_str += ';DP2mis=' + ','.join(map(str, count_mismatch.counted()))
             supple_dup_str += ';DP2nonREFnonALT=' + ','.join(map(str, count_nonREFnonALT.counted()))
-            
+
             ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1])
             ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2])
 
-            info_values[DP4_idx] = "DP4=" + str(DP4rf)+ "," + str(DP4rr)+ "," + str(DP4af)+ "," + str(DP4ar)            
+            info_values[DP4_idx] = "DP4=" + str(DP4rf)+ "," + str(DP4rr)+ "," + str(DP4af)+ "," + str(DP4ar)
             info_values.append(ACGTNacgtn1_string)
             info_values.append(ACGTNacgtn2_string)
             info_values.append(DP4_original)
-            info_values.append(supple_dup_str)            
+            info_values.append(supple_dup_str)
 
             entries[header_indices["INFO"]] = ';'.join(info_values)
-            
+
             sys.stdout.write('\t'.join(entries) + '\n')
         else:
             sys.stdout.write(line)   # write germline and somatic-multiallelic SNVs as is
@@ -446,7 +446,7 @@ def performAnalysis(args):
 
     #vcfInFile.close()
     #outFile.close()
-    
+
 if __name__ == '__main__':
     #print "Starting program...\n" 
     import argparse
diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh
index 713a5bc..856ef35 100755
--- a/resources/analysisTools/snvPipeline/snvAnnotation.sh
+++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh
@@ -221,7 +221,7 @@ then
 	NRSOMSNV=`grep -v "^#" ${filenameSomaticSNVsTmp} | wc -l`
 	echo -e "SOMATIC_SNVS_UNFILTERED\t${NRSOMSNV}">> ${filenameQCValues}
 
-	mv ${filenameSNVVCFTemp}.tmp ${filenameSNVVCFTemp}    
+	mv ${filenameSNVVCFTemp}.tmp ${filenameSNVVCFTemp}
 
     wait ${zipAlternativeAlleleBaseScores} ; [[ $? -gt 0 ]] && echo "Error from zipAlternativeAlleleBaseScores" && exit 31
     wait ${zipReferenceAlleleBaseScores} ; [[ $? -gt 0 ]] && echo "Error from zipReferenceAlleleBaseScores" && exit 32
diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 6049fab..61e4bf7 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -39,15 +39,15 @@
         <cvalue name='SNVFILE_PREFIX' value='snvs_' type="string"/>
         <!--<cvalue name='analysisMethodNameOnInput' value='whole_genome_analysis' type='string' />-->
         <cvalue name="RAW_SNV_FILTER_OPTIONS" value='" --minVac=3 --minVaf=0.03 --minVacPS=2"' type="string"/>
-        
+
 
     	<!-- Reference genomes -->
         <cvalue name='REFERENCE_GENOME' value="${referenceGenome_hg19_chr}" type="string"/>
-        
-        <!--## Databases -->        
-        <cvalue name='hg19DatabasesDirectory' value="/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases" type="path" />        
+
+        <!--## Databases -->
+        <cvalue name='hg19DatabasesDirectory' value="/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases" type="path" />
         <cvalue name="hg19DatabaseUCSCDirectory" value="/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases/UCSC" type="path" />
-        
+
         <cvalue name="DBSNP" value="${hg19DatabasesDirectory}/dbSNP/dbSNP_147/00-All.SNV.vcf.gz" type="path"/>
         <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
         <cvalue name="KGENOME" value="${hg19DatabasesDirectory}/1000genomes/ALL.wgs.phase1_integrated_calls.20101123.snps_chr.vcf.gz" type="path"/>
diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
index 77711f8..d8827b2 100644
--- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -10,8 +10,8 @@
   <configurationvalues>
 
     <!--hg38 base directory-->
-    <cvalue name='hg38BaseDirectory' value='/icgc/ngs_share/assemblies/hg_GRCh38' type='path' />    
-    
+    <cvalue name='hg38BaseDirectory' value='/icgc/ngs_share/assemblies/hg_GRCh38' type='path' />
+
     <!-- Reference genomes -->
     <cvalue name='REFERENCE_GENOME' value="${hg38BaseDirectory}/sequence/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa" type="path"/>
     <cvalue name='INDEX_PREFIX' value="${hg38BaseDirectory}/indexes/bwa/bwa07_GRCh38_decoy_ebv_phiX_alt_hla_chr/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa" type="path"/>
@@ -20,11 +20,11 @@
 
     <cvalue name='CHR_PREFIX' value='chr' type='string' />
     <cvalue name="disableAutoBAMHeaderAnalysis" value="true" type="boolean"/>
-    
+
     <!--## Databases -->
     <cvalue name='hg38DatabasesDirectory' value="${hg38BaseDirectory}/databases" type='path' />
-    <cvalue name='hg38DatabaseUCSCDirectory' value="${hg38DatabasesDirectory}/UCSC" type='path' />    
-    
+    <cvalue name='hg38DatabaseUCSCDirectory' value="${hg38DatabasesDirectory}/UCSC" type='path' />
+
     <cvalue name="DBSNP" value="${hg38DatabasesDirectory}/dbSNP/dbSNP_151_GRCh38.p7/00-All_v151_GRCh38.SNV.vcf.gz" type="path"/>
     <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
     <cvalue name="KGENOME" value="${hg38DatabasesDirectory}/1000Genome/hg38/ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz" type="path"/>
@@ -42,11 +42,11 @@
     <cvalue name='ANNOVAR_BUILDVER' value='hg38' type='string'/>
     <cvalue name='ANNOVAR_DBPATH' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/humandb/' type='path'/>
     <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV31'" type='string'/>
-    
+
     <!--Reference annotations during confidene annotation-->
     <cvalue name="CONFIDENCE_OPTS" value="-t 500 --refgenome GRCh38 ftp://ftp.sanger.ac.uk/pub/cancer/dockstore/human/GRCh38_hla_decoy_ebv/core_ref_GRCh38_hla_decoy_ebv.tar.gz" type="string"
             description="max read depth in control (-t INT default 150 for genomes, set to 500 for exomes) can now be ommited to avoid depth dependent punishing by setting -c 0. It is also possible to get a justification for the confidence score by setting the -p 1 flag"/>
-    
+
     <!--SNV reliability bundle-->
     <configurationValueBundle name='PIPE_CONFIG:SNV_RELIABILITY'>
      	<cvalue name='MAPABILITY' value='${hg38DatabasesDirectory}/mappability/GRCh38_Mappability_Align_100mer.bedGraph.gz' type="path"/>

From aef5111697bc116fcd90c6910b141f1ed12f3daf Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Mon, 26 Apr 2021 17:03:33 +0200
Subject: [PATCH 20/49] Uncommenting reference detection

---
 resources/analysisTools/snvPipeline/snvAnnotation.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh
index 856ef35..94a4bbe 100755
--- a/resources/analysisTools/snvPipeline/snvAnnotation.sh
+++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh
@@ -36,8 +36,8 @@ scriptDirectory=`dirname ${WRAPPED_SCRIPT}`
 #TOOLS_DIR=${BASEPATH_TOOLS}
 FILEBASE=`dirname ${FILENAME_VCF_IN}`/`basename ${FILENAME_VCF_IN} .vcf`
 
-#source ${TOOL_ANALYZE_BAM_HEADER}
-#getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
+source ${TOOL_ANALYZE_BAM_HEADER}
+getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFIX and REFERENCE_GENOME
 
 
 [[ -f ${FILENAME_CHECKPOINT} ]] && rm ${FILENAME_CHECKPOINT}

From 22a55ecd90ebee01de7ccfe116a54c339294df7f Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Fri, 30 Apr 2021 11:31:45 +0200
Subject: [PATCH 21/49] updating refgenome help

---
 .../analysisTools/snvPipeline/confidenceAnnotation_SNVs.py  | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index 22be339..f1b03ae 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -776,8 +776,10 @@ def main(args):
     parser.add_argument("-g", "--refgenome", dest="refgenome", nargs=2,
                         default=["hs37d5", "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/" \
                                            "phase2_reference_assembly_sequence/hs37d5.fa.gz", ],
-                        help="reference genome used for calling ID, path (default hs37d5, ftp://ftp.1000genomes.ebi" \
-                             ".ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)")
+                        help="Argument's first value is used to determine if the workflow is using hg19 or hg38 reference genome. " \
+                             "Allowed first values are 'hs37d5' or 'GRCh37', second value could be the downloaded URL of the reference genome. " \
+                             "(default hs37d5, ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)" \
+                             "")
     parser.add_argument("-z", "--center", dest="center", nargs="?", default="DKFZ",
                         help="Center (unclear if the center where the data was produced or the center of the " \
                              "calling pipeline; default DKFZ).")

From 71900b37e8795b1cbeb316c6284b7b78bda72840 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Fri, 30 Apr 2021 11:40:27 +0200
Subject: [PATCH 22/49] Raise error: unknown alignment suffix

---
 resources/analysisTools/snvPipeline/filter_PEoverlap.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
index 03db8dc..4807e8a 100755
--- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py
+++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
@@ -139,6 +139,8 @@ def performAnalysis(args):
     elif args.alignmentFile.split(".")[-1] == "cram":
         mode += "c"
         samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file)
+    else:
+        raise "Unknown file alignment suffix '%s'. Need 'bam' or 'cram'" % args.alignmentFile.split(".")[-1]
 
     if args.altPosF != '':
         ALT_basePositions_file = args.altPosF

From dbd835d66161591fc858d7733433c5d341df43a9 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Fri, 30 Apr 2021 11:46:30 +0200
Subject: [PATCH 23/49] Reformatting in_dbSNPcounter.pl

---
 .../snvPipeline/in_dbSNPcounter.pl            | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
index 737ae04..28c1a32 100755
--- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
+++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
@@ -49,11 +49,26 @@
 my $i = 0;
 while($i < @head)
 {
-	if($head[$i] eq "DBSNP"){$col{"DBSNP"} = $i; print STDERR "DBSNP in column ", $i+1,"\n";}
-	if($head[$i] eq "1K_GENOMES"){$col{"1K_GENOMES"} = $i; print STDERR "1K_GENOMES in column ", $i+1,"\n";}
-	if($head[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print STDERR "RECLASSIFICATION in column ", $i+1,"\n";}
-	if($head[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print STDERR "ANNOTATION_control in column ", $i+1,"\n";}
-	if($head[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print STDERR "CONFIDENCE in column ", $i+1,"\n";}
+	if($head[$i] eq "DBSNP"){
+		$col{"DBSNP"} = $i;
+		print STDERR "DBSNP in column ", $i+1,"\n";
+	}
+	if($head[$i] eq "1K_GENOMES"){
+		$col{"1K_GENOMES"} = $i;
+		print STDERR "1K_GENOMES in column ", $i+1,"\n";
+	}
+	if($head[$i] eq "RECLASSIFICATION"){
+		$col{"RECLASSIFICATION"} = $i;
+		print STDERR "RECLASSIFICATION in column ", $i+1,"\n";
+	}
+	if($head[$i] eq "ANNOTATION_control"){
+		$col{"ANNOTATION_control"} = $i;
+		print STDERR "ANNOTATION_control in column ", $i+1,"\n";
+	}
+	if($head[$i] eq "CONFIDENCE"){
+		$col{"CONFIDENCE"} = $i;
+		print STDERR "CONFIDENCE in column ", $i+1,"\n";
+	}
 	$i++;
 }
 

From df184d9806302e52773e4d7ca0d00da021aa92c8 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Fri, 30 Apr 2021 12:00:11 +0200
Subject: [PATCH 24/49] Reformatting IMD R file

---
 .../snvPipeline/intermutationDistance_Coord_color.r        | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
index 67ec8b3..faa1757 100755
--- a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
+++ b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
@@ -46,8 +46,11 @@ data <- read.table(opt$chrLengthFile, header=FALSE)
 chrLength <- data.frame(data)
 rownames(chrLength) <- chrLength$V1
 
-if((opt$chrPrefix != "" && grepl(opt$chrPrefix, chrLength$V1[1])) && !grepl(opt$chrPrefix, c.array[1])){
-	c.array <- paste0(opt$chrPrefix, c.array, opt$chrSuffix)
+if(opt$chrPrefix != "" && 
+  grepl(opt$chrPrefix, chrLength$V1[1]) && 
+	!grepl(opt$chrPrefix, c.array[1])){
+
+    c.array <- paste0(opt$chrPrefix, c.array, opt$chrSuffix)
 }
 
 xtotal <- sum(chrLength[c.array,2]/10)

From f93779c5672c47e81de8922911c73b3ca81a0d6e Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Wed, 29 Sep 2021 11:10:32 +0200
Subject: [PATCH 25/49] Increasing the mem requirement

---
 resources/configurationFiles/analysisSNVCalling.xml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 61e4bf7..07ec8de 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -231,8 +231,8 @@
                 <rset size="t" memory="1" cores="1" nodes="1" walltime="1" queue="devel"/>
                 <rset size="s" memory="3" cores="2" nodes="1" walltime="6"/>
                 <rset size="m" memory="5" cores="3" nodes="1" walltime="12"/>
-                <rset size="l" memory="5" cores="3" nodes="1" walltime="24"/>
-                <rset size="xl" memory="6" cores="3" nodes="1" walltime="48"/>
+                <rset size="l" memory="7" cores="3" nodes="1" walltime="24"/>
+                <rset size="xl" memory="9" cores="3" nodes="1" walltime="48"/>
             </resourcesets>
             <input type="file" typeof="de.dkfz.b080.co.files.SNVAnnotationFile" scriptparameter="FILENAME_VCF_IN"/>
             <input type="file" typeof="de.dkfz.b080.co.files.TumorBamFile" scriptparameter="TUMOR_BAMFILE_FULLPATH_BP"/>
@@ -247,7 +247,7 @@
                 <rset size="t" memory="1" cores="1" nodes="1" walltime="1" queue="devel"/>
                 <rset size="s" memory="1" cores="3" nodes="1" walltime="4"/>
                 <rset size="m" memory="1" cores="3" nodes="1" walltime="8"/>
-                <rset size="l" memory="1" cores="3" nodes="1" walltime="12"/>
+                <rset size="l" memory="2" cores="3" nodes="1" walltime="12"/>
                 <rset size="xl" memory="4" cores="3" nodes="1" walltime="24"/>
             </resourcesets>
             <input type="file" typeof="de.dkfz.b080.co.files.SNVAnnotationFile" scriptparameter="FILENAME_VCF"/>
@@ -261,7 +261,7 @@
                 <rset size="t" memory="1" cores="1" nodes="1" walltime="1" queue="devel"/>
                 <rset size="s" memory="1" cores="1" nodes="1" walltime="1"/>
                 <rset size="m" memory="1" cores="1" nodes="1" walltime="2"/>
-                <rset size="l" memory="1" cores="1" nodes="1" walltime="4"/>
+                <rset size="l" memory="2" cores="1" nodes="1" walltime="4"/>
                 <rset size="xl" memory="4" cores="1" nodes="1" walltime="8"/>
             </resourcesets>
             <input type="file" typeof="de.dkfz.b080.co.files.SNVAnnotationFile" scriptparameter="FILENAME_VCF"/>

From 7beae9320de5f063a6654a9e9e5d48b020dc10b7 Mon Sep 17 00:00:00 2001
From: Nagarajan Paramasivam <n.paramasivam@dkfz.de>
Date: Mon, 18 Oct 2021 12:25:24 +0200
Subject: [PATCH 26/49] safer IO for in_dbSNPcounter.pl

---
 .../snvPipeline/in_dbSNPcounter.pl            | 117 +++++++++---------
 1 file changed, 58 insertions(+), 59 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
index 28c1a32..0c0b93d 100755
--- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
+++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
@@ -23,7 +23,7 @@
 {
 	$minscore = 8;
 }
-open (FH, $file) or die "Could not open $file: $!\n";
+open (my $FH, $file) or die "Could not open $file: $!\n";
 
 # count all, somatic, s. with minconfidence, in dbSNP, in 1KG
 my $all = 0;
@@ -37,75 +37,75 @@
 
 
 my $header = "";
-while ($header = <FH>)
-{
-	last if ($header =~ /^\#CHROM/); # that is the line with the column names
-}
-chomp $header;
+my ($DBSBP, $KG, $CONFIDENCE, $CLASSIFICATION);
 
-my @head=split("\t", $header);
-my %col;
+while(!eof($FH)){
+	my $line = readline($FH) || die "Could not read line: $?";
+	chomp $line;
+	if ($line =~ /^#/)
+	{
+		#print STDOUT "HELLO\n";
+		if ($line =~/^#CHROM/){
+			chomp $line;
 
-my $i = 0;
-while($i < @head)
-{
-	if($head[$i] eq "DBSNP"){
-		$col{"DBSNP"} = $i;
-		print STDERR "DBSNP in column ", $i+1,"\n";
-	}
-	if($head[$i] eq "1K_GENOMES"){
-		$col{"1K_GENOMES"} = $i;
-		print STDERR "1K_GENOMES in column ", $i+1,"\n";
-	}
-	if($head[$i] eq "RECLASSIFICATION"){
-		$col{"RECLASSIFICATION"} = $i;
-		print STDERR "RECLASSIFICATION in column ", $i+1,"\n";
-	}
-	if($head[$i] eq "ANNOTATION_control"){
-		$col{"ANNOTATION_control"} = $i;
-		print STDERR "ANNOTATION_control in column ", $i+1,"\n";
-	}
-	if($head[$i] eq "CONFIDENCE"){
-		$col{"CONFIDENCE"} = $i;
-		print STDERR "CONFIDENCE in column ", $i+1,"\n";
-	}
-	$i++;
-}
+			my @head=split("\t", $line);
+			my %col;
 
-my $DBSBP = $col{"DBSNP"};
-my $KG = $col{"1K_GENOMES"};
-my $CONFIDENCE = $col{"CONFIDENCE"};
-my $CLASSIFICATION = $col{"RECLASSIFICATION"};
+			my $i = 0;
+			while($i < @head)
+			{
+				if($head[$i] eq "DBSNP"){
+					$col{"DBSNP"} = $i;
+					print STDERR "DBSNP in column ", $i+1,"\n";
+				}
+				if($head[$i] eq "1K_GENOMES"){
+					$col{"1K_GENOMES"} = $i;
+					print STDERR "1K_GENOMES in column ", $i+1,"\n";
+				}
+				if($head[$i] eq "RECLASSIFICATION"){
+					$col{"RECLASSIFICATION"} = $i;
+					print STDERR "RECLASSIFICATION in column ", $i+1,"\n";
+				}
+				if($head[$i] eq "ANNOTATION_control"){
+					$col{"ANNOTATION_control"} = $i;
+					print STDERR "ANNOTATION_control in column ", $i+1,"\n";
+				}
+				if($head[$i] eq "CONFIDENCE"){
+					$col{"CONFIDENCE"} = $i;
+					print STDERR "CONFIDENCE in column ", $i+1,"\n";
+				}
+				$i++;
+			}
 
-# I know that it's evilly hardcoded!
-while (<FH>)
-{
-	if ($_ =~ /^#/)	# skip header lines
-	{
-		next;
+			$DBSBP = $col{"DBSNP"};
+			$KG = $col{"1K_GENOMES"};
+			$CONFIDENCE = $col{"CONFIDENCE"};
+			$CLASSIFICATION = $col{"RECLASSIFICATION"};
+		}
 	}
-	$all++;
-	@help = split ("\t", $_);
-	if ($help[$CLASSIFICATION] =~ /somatic/)
-	{
-		$somatic++;
-		if ($help[$CONFIDENCE] >= $minscore)
+	else{
+		$all++;
+		@help = split ("\t", $line);
+		if ($help[$CLASSIFICATION] =~ /somatic/)
 		{
-			$scored++;
-			#print STDERR $_;
-			if ($help[$DBSBP] =~ /MATCH=exact/)
-			{
-				$dbSNP++;
-			}
-			if ($help[$KG] =~ /MATCH=exact/)
+			$somatic++;
+			if ($help[$CONFIDENCE] >= $minscore)
 			{
-				$oneKG++;
+				$scored++;
+				if ($help[$DBSBP] =~ /MATCH=exact/)
+				{
+					$dbSNP++;
+				}
+				if ($help[$KG] =~ /MATCH=exact/)
+				{
+					$oneKG++;
+				}
 			}
 		}
 	}
 }
+close $FH;
 
-close FH;
 if ($scored > 0)
 {
 	$perc_dbSNP = sprintf ("%.2f", ($dbSNP/$scored*100));
@@ -116,6 +116,5 @@
 else	# there might be no ...
 {
 	die "!!! There are no SNVs with confidence >= $minscore in the input, it makes no sense to do further analyses with the current settings on these data!!!\n";
-	
 }
 exit;

From d27035818c8f2294fd678f5669b8fb58165291bf Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Wed, 4 May 2022 09:51:07 +0200
Subject: [PATCH 27/49] Add variant calling in HLA/ALT contigs

---
 .../snvPipeline/PurityReloaded.py             |  4 +++
 .../snvPipeline/createErrorPlots.py           |  2 +-
 .../snvPipeline/filter_PEoverlap.py           |  6 +++-
 .../analysisTools/snvPipeline/snvCalling.sh   | 28 ++++++++++++++++---
 .../vcf_pileup_compare_allin1_basecount.pl    | 14 +++++++---
 .../analysisSNVCallingGRCh38.xml              | 10 +++++++
 6 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/PurityReloaded.py b/resources/analysisTools/snvPipeline/PurityReloaded.py
index e3aadf9..b8dcb7d 100755
--- a/resources/analysisTools/snvPipeline/PurityReloaded.py
+++ b/resources/analysisTools/snvPipeline/PurityReloaded.py
@@ -251,6 +251,10 @@ def parseVcf(file,num):
 	while (l!= ""):
 		t=l.split('\t')
 		if (t[0][0] != "#") and isValid(t):
+			# Skiping the non-primary assembly variants from purity calculations
+			if t[0].startswith('HLA') or t[0].endswith('_alt'):
+				l=vcf.readline()
+				continue
 			i = chromMap[t[0]]
 			if (t[12]=="germline"):
 			  #DP5=string.split(string.split(t[11],";")[1],",")
diff --git a/resources/analysisTools/snvPipeline/createErrorPlots.py b/resources/analysisTools/snvPipeline/createErrorPlots.py
index 2ed2416..dc7dbfc 100755
--- a/resources/analysisTools/snvPipeline/createErrorPlots.py
+++ b/resources/analysisTools/snvPipeline/createErrorPlots.py
@@ -31,7 +31,7 @@ def calculateLogSize(size, max_val, base):
 	return math.log(((size/max_val)*(base-1.))+1., base)
 
 def calculateRootedSize(size, max_val):
-	if(float(size) != 0.0):
+	if(float(size) != 0.0 and float(max_val) != 0.0):
 		return np.sqrt(size/max_val)
 	else:
 		return 0.0
diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
index 4807e8a..6d19fcf 100755
--- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py
+++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
@@ -210,7 +210,11 @@ def performAnalysis(args):
             count_nonREFnonALT = BoolCounter(REF, ALT) # " for nonREF and nonALT bases
 
             # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq).
-            for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators):
+            if chrom.endswith('alt') or chrom.startswith('HLA'):
+                flag_filer_value = 1796
+            else:
+                flag_filer_value = 3844
+            for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=flag_filer_value, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators):
                 if pileupcolumn.pos == (pos-1):
                     #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments)
                     for pileupread in pileupcolumn.pileups:
diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh
index de88566..d931226 100755
--- a/resources/analysisTools/snvPipeline/snvCalling.sh
+++ b/resources/analysisTools/snvPipeline/snvCalling.sh
@@ -30,7 +30,12 @@ getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFI
 chr=""
 if [[ -n "$PARM_CHR_INDEX" ]]
 then
-    chr=${CHR_PREFIX}${PARM_CHR_INDEX}${CHR_SUFFIX}
+    if [[ "$PARM_CHR_INDEX" == "ALT" || "$PARM_CHR_INDEX" == "HLA" ]]
+    then
+        chr=${PARM_CHR_INDEX}
+    else
+        chr=${CHR_PREFIX}${PARM_CHR_INDEX}${CHR_SUFFIX}
+    fi
 else
     chr=${CHR_PREFIX}${CHROMOSOME_INDICES[$((RODDY_JOBARRAYINDEX-1))]}${CHR_SUFFIX}
     # File names can contain X and Y. The pattern contains RODDY_JOBARRAYINDEX so this has to be changed temporary as the value is always numeric!
@@ -66,8 +71,23 @@ filenameMPileupError="${MPILEUP_SUBDIR}/${MPILEUPOUT_PREFIX}${PID}.${chr}.error"
 
 FILENAME_VCF_SNVS_TEMP=${FILENAME_VCF_SNVS}.tmp
 
-#[[ ! -f $filenameMPileupTemp ]] && 
-${BCFTOOLS_BINARY} mpileup ${MPILEUP_OPTS} -r ${chr} -f ${REFERENCE_GENOME} $tumorbamfullpath | ${BCFTOOLS_BINARY} call ${BCFTOOLS_OPTS} > $filenameMPileupTemp
+## For variant calling from ALT and HLA region in the GRCh38 genome
+# The *_CONTIGS_FILE contains the list of the contigs names
+MPILEUP_REGION_OPTION=""
+if [[ "$chr" == "HLA" ]]
+then
+    MPILEUP_REGION_OPTION="-R ${CHROMOSOME_HLA_CONTIGS_FILE}"
+    MPILEUP_OPTS=$MPILEUP_OPTS_ALT_HLA
+elif [[ "$chr" == "ALT" ]]
+then
+    MPILEUP_REGION_OPTION="-R ${CHROMOSOME_ALT_CONTIGS_FILE}"
+    MPILEUP_OPTS=$MPILEUP_OPTS_ALT_HLA
+else
+    MPILEUP_REGION_OPTION="-r ${chr}"
+fi
+
+#[[ ! -f $filenameMPileupTemp ]] &&
+${BCFTOOLS_BINARY} mpileup ${MPILEUP_OPTS} ${MPILEUP_REGION_OPTION} -f ${REFERENCE_GENOME} $tumorbamfullpath | ${BCFTOOLS_BINARY} call ${BCFTOOLS_OPTS} > $filenameMPileupTemp
 
 if [[ "$?" != 0 ]]
 then
@@ -103,7 +123,7 @@ if [[ -n "$firstLineVCF" ]]; then
             mkfifo $NP_MPILEUP
 
             # if there is a germline BAM, first look up these positions in the control file by just piling up the bases, this is NO SNV calling
-            ${SAMTOOLS_BINARY} mpileup ${MPILEUPCONTROL_OPTS} -r ${chr} -l ${filenameMPileupOut} -f ${REFERENCE_GENOME} ${CONTROL_BAMFILE_FULLPATH_BP} > ${NP_MPILEUP} &
+            ${SAMTOOLS_BINARY} mpileup ${MPILEUPCONTROL_OPTS} -l ${filenameMPileupOut} -f ${REFERENCE_GENOME} ${CONTROL_BAMFILE_FULLPATH_BP} > ${NP_MPILEUP} &
             ${PERL_BINARY} ${TOOL_VCF_PILEUP_COMPARE} ${filenameMPileupOut} $NP_MPILEUP "Header" > ${FILENAME_VCF_SNVS_TEMP}
 
             rm $NP_MPILEUP
diff --git a/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl b/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl
index e455d0a..7700ab8 100755
--- a/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl
+++ b/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl
@@ -136,6 +136,9 @@
 my $ccoord = -1; # make sure that ccoord in first iteration is smaller than tcoord
 my $tcoord = 0;
 
+my $current_c_chr = "";
+my $current_t_chr = "";
+
 # one file is longer than the other, nevermind which
 #(if we did pileup for only the tumor SNV positions, control will be shorter; otherwise longer)
 TUMORFILE_LOOP: while ($lineT=<T>)
@@ -150,7 +153,9 @@
 	chomp $lineT;
 	@tum = split (/\s+/, $lineT);
 	$tcoord = $tum[1];	# start coordinate
-	if ($tcoord < $ccoord)	# no matching control line => tumor position not covered in control (never true in 1st iteration)
+	$current_t_chr = $tum[0];
+
+	if ($tcoord < $ccoord && $current_t_chr eq $current_c_chr)	# no matching control line => tumor position not covered in control (never true in 1st iteration)
 	{
 		$missingC++;
 		# print the tumor line with according "flag"
@@ -158,7 +163,7 @@
 		print $lineT, "\tDP=0;DP5=0,0,0,0,0;DP5all=0,0,0,0,0;ACGTNacgtnHQ=0,0,0,0,0,0,0,0,0,0;ACGTNacgtn=0,0,0,0,0,0,0,0,0,0;VAF=0;TSR=0\t$notcovered\n";
 		next;	# read next line from tumor
 	}
-	if ($tcoord == $ccoord)	# matching pair found!
+	if ($tcoord == $ccoord && $current_t_chr eq $current_c_chr)	# matching pair found!
 	{
 		$match++;
 		chomp $lineC;
@@ -176,15 +181,16 @@
 		$ctrC++;
 		# split lines
 		@ctrl = split (/\s+/, $lineC);
+		$current_c_chr = $ctrl[0];
 		$ccoord = $ctrl[1];
-		if ($tcoord == $ccoord)	# matching pair found!
+		if ($tcoord == $ccoord && $current_t_chr eq $current_c_chr)	# matching pair found!
 		{
 			$match++;
 			# do the evaluation in a subroutine
 			evaluate_pos();
 			next TUMORFILE_LOOP; # and read next line from tumor
 		}
-		if ($tcoord < $ccoord)	# new ccoord is higher than tcoord =>  tumor position not covered in control (should never be the case!)
+		if ($tcoord < $ccoord && $current_t_chr eq $current_c_chr)	# new ccoord is higher than tcoord =>  tumor position not covered in control (should never be the case!)
 		{
 			$missingC++;
 			print $lineT, "\tDP=0;DP5=0,0,0,0,0;DP5all=0,0,0,0,0;ACGTNacgtnHQ=0,0,0,0,0,0,0,0,0,0;ACGTNacgtn=0,0,0,0,0,0,0,0,0,0;VAF=0;TSR=0\t$notcovered\n";
diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
index d8827b2..974b29c 100644
--- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -18,6 +18,16 @@
     <cvalue name="CHROM_SIZES_FILE" value="${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLenOnlyACGT_realChromosomes.tsv" type="path" />
     <cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLength.tsv' type="path" />
 
+    <!--Chr indices for the calling-->
+    <cvalue name="CHROMOSOME_INDICES" value="( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y HLA ALT )" type="bashArray" />
+    <cvalue name="CHROMOSOME_HLA_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.HLA_contigs_list.txt' type="path" />
+    <cvalue name="CHROMOSOME_ALT_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.ALT_contigs_list.txt' type="path" />
+
+    <!--BCFtools options fo the variant calling-->
+    <cvalue name='MPILEUP_OPTS' value='"-EI -q 30 -O u --ignore-RG --ff UNMAP,SECONDARY,QCFAIL,DUP,SUPPLEMENTARY -d 9999 -a AD -x"' type="string"/>
+    <cvalue name='MPILEUP_OPTS_ALT_HLA' value='"-A -EI -q 30 -O u --ignore-RG --ff UNMAP,SECONDARY,QCFAIL,DUP -d 9999 -a AD -x"' type="string"/>
+    <cvalue name='BCFTOOLS_OPTS' value='"-vc -p 2.0 --ploidy GRCh38"' type="string"/>
+
     <cvalue name='CHR_PREFIX' value='chr' type='string' />
     <cvalue name="disableAutoBAMHeaderAnalysis" value="true" type="boolean"/>
 

From b683df259d3528269448835a1a154b3dc7d2678f Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Wed, 4 May 2022 09:51:53 +0200
Subject: [PATCH 28/49] Update ngs_share path

---
 resources/configurationFiles/analysisSNVCalling.xml       | 4 ++--
 resources/configurationFiles/analysisSNVCallingGRCh38.xml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 07ec8de..8ad7427 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -45,8 +45,8 @@
         <cvalue name='REFERENCE_GENOME' value="${referenceGenome_hg19_chr}" type="string"/>
 
         <!--## Databases -->
-        <cvalue name='hg19DatabasesDirectory' value="/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases" type="path" />
-        <cvalue name="hg19DatabaseUCSCDirectory" value="/icgc/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases/UCSC" type="path" />
+        <cvalue name='hg19DatabasesDirectory' value="/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases" type="path" />
+        <cvalue name="hg19DatabaseUCSCDirectory" value="/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg19_GRCh37_1000genomes/databases/UCSC" type="path" />
 
         <cvalue name="DBSNP" value="${hg19DatabasesDirectory}/dbSNP/dbSNP_147/00-All.SNV.vcf.gz" type="path"/>
         <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
index 974b29c..d1c2e79 100644
--- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -10,7 +10,7 @@
   <configurationvalues>
 
     <!--hg38 base directory-->
-    <cvalue name='hg38BaseDirectory' value='/icgc/ngs_share/assemblies/hg_GRCh38' type='path' />
+    <cvalue name='hg38BaseDirectory' value='/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38' type='path' />
 
     <!-- Reference genomes -->
     <cvalue name='REFERENCE_GENOME' value="${hg38BaseDirectory}/sequence/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa" type="path"/>

From 1c0bc26745028ac3c8cbc8c72977ac1ba935e00d Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 30 May 2022 11:01:02 +0200
Subject: [PATCH 29/49] Remove RE/MAP from hg38 penalties

---
 .../snvPipeline/confidenceAnnotation_SNVs.py  | 121 +++++++++---------
 1 file changed, 60 insertions(+), 61 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index f1b03ae..d1089cf 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -311,39 +311,38 @@ def main(args):
                 reasons += "bias_filter_round2_SEQ(-3)"
                 filterfield["BI"] = 1
 
-        # 2) annotations of regions that cause problems: some classes of repeats from RepeatMasker track,
-        # segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability
-        # simple repeats and low complexity (not the same as homopolymer, but similar enough);
-        # some satellites are not annotated in blacklist ...
-        if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]):
-            is_repeat = True
-            confidence -= 2
-            reasons += "Simple_repeat(-2)"
-            filterfield["RE"] = 1
-        # other repeat elements to penalize at least a bit
-        elif help["REPEAT_MASKER_VALID"]:
-            confidence -= 1
-            reasons += "Other_repeat(-1)"
-            filterfield["RE"] = 1
-
-        # simple tandem repeats most often coincide with other bad features - do not penalize twice
-        if help["SIMPLE_TANDEMREPEATS_VALID"]:
-            is_STR = 1
-            if not is_repeat:
+        # Only for hg19 reference genome
+        if args.refgenome[0] == "hs37d5":
+            # 2) annotations of regions that cause problems: some classes of repeats from RepeatMasker track,
+            # segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability
+            # simple repeats and low complexity (not the same as homopolymer, but similar enough);
+            # some satellites are not annotated in blacklist ...
+            if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]):
+                is_repeat = True
                 confidence -= 2
-                reasons += "Tandem_repeat(-2)"
+                reasons += "Simple_repeat(-2)"
+                filterfield["RE"] = 1
+            # other repeat elements to penalize at least a bit
+            elif help["REPEAT_MASKER_VALID"]:
+                confidence -= 1
+                reasons += "Other_repeat(-1)"
                 filterfield["RE"] = 1
 
-        # Segmental Duplications are less effective than homopolymers, short tandem repeats and microsatellites,
-        # do not penality twice
-        if help["ANNOVAR_SEGDUP_COL_VALID"] and not (is_repeat or is_STR):
-            confidence -= 2 # bad region
-            is_weird = True
-            reasons += "Segmental_dup(-2)"
-            filterfield["RE"] = 1
+            # simple tandem repeats most often coincide with other bad features - do not penalize twice
+            if help["SIMPLE_TANDEMREPEATS_VALID"]:
+                is_STR = 1
+                if not is_repeat:
+                    confidence -= 2
+                    reasons += "Tandem_repeat(-2)"
+                    filterfield["RE"] = 1
 
-        # Only for hg19 reference genome
-        if args.refgenome[0] == "hs37d5":
+            # Segmental Duplications are less effective than homopolymers, short tandem repeats and microsatellites,
+            # do not penality twice
+            if help["ANNOVAR_SEGDUP_COL_VALID"] and not (is_repeat or is_STR):
+                confidence -= 2 # bad region
+                is_weird = True
+                reasons += "Segmental_dup(-2)"
+                filterfield["RE"] = 1
 
             # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat
             if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]:
@@ -360,44 +359,44 @@ def main(args):
                 reasons += "Hiseqdepth(-3)"
                 filterfield["HSDEPTH"] = 1
 
-        # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ...
-        # Everything with really high number of occurences is artefacts
-        # does not always correlate with the above regions
-        # is overestimating badness bc. of _single_ end read simulations
-        if help["MAPABILITY"] == ".":
-            # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0
-            confidence -= 5
-            reasons += "Not_mappable(-5)"
-            filterfield["MAP"] = 1
-        else:
-            reduce = 0
-            mapability = min(map(float, help["MAPABILITY"].split("&")))
-            if mapability < 0.5:
-                # 0.5 does not seem to be that bad: region appears another time in
-                # the genome and we have paired end data!
-                confidence -= 1
-                reduce += 1
-
-                is_weird = True # something _is_ weird already there and known SNPs might be artefacts
-
-                if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad
+            # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ...
+            # Everything with really high number of occurences is artefacts
+            # does not always correlate with the above regions
+            # is overestimating badness bc. of _single_ end read simulations
+            if help["MAPABILITY"] == ".":
+                # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0
+                confidence -= 5
+                reasons += "Not_mappable(-5)"
+                filterfield["MAP"] = 1
+            else:
+                reduce = 0
+                mapability = min(map(float, help["MAPABILITY"].split("&")))
+                if mapability < 0.5:
+                    # 0.5 does not seem to be that bad: region appears another time in
+                    # the genome and we have paired end data!
                     confidence -= 1
                     reduce += 1
 
-                if mapability < 0.25: # > 4 times appearing region
-                    confidence -= 1
-                    reduce += 1
+                    is_weird = True # something _is_ weird already there and known SNPs might be artefacts
 
-                if mapability < 0.1: # > 5 times is bad
-                    confidence -= 2
-                    reduce += 2
+                    if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad
+                        confidence -= 1
+                        reduce += 1
 
-                if mapability < 0.05: # these regions are clearly very bad (Lego stacks)
-                    confidence -= 3
-                    reduce += 3
+                    if mapability < 0.25: # > 4 times appearing region
+                        confidence -= 1
+                        reduce += 1
 
-                filterfield["MAP"] = 1
-                reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce)
+                    if mapability < 0.1: # > 5 times is bad
+                        confidence -= 2
+                        reduce += 2
+
+                    if mapability < 0.05: # these regions are clearly very bad (Lego stacks)
+                        confidence -= 3
+                        reduce += 3
+
+                    filterfield["MAP"] = 1
+                    reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce)
 
         # if others have found the SNP already, it may be interesting despite low score
         # - but only if it's not a weird region.

From 6d479821b36cb8d700ba9d0ff10e6d2474fdc0c5 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Fri, 8 Jul 2022 12:51:35 +0200
Subject: [PATCH 30/49] Add m2e2,HLA/ALT mappability files

---
 resources/configurationFiles/analysisSNVCallingGRCh38.xml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
index d1c2e79..1bb222f 100644
--- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -20,8 +20,8 @@
 
     <!--Chr indices for the calling-->
     <cvalue name="CHROMOSOME_INDICES" value="( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y HLA ALT )" type="bashArray" />
-    <cvalue name="CHROMOSOME_HLA_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.HLA_contigs_list.txt' type="path" />
-    <cvalue name="CHROMOSOME_ALT_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.ALT_contigs_list.txt' type="path" />
+    <cvalue name="CHROMOSOME_HLA_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.HLA_contigs.bed' type="path" />
+    <cvalue name="CHROMOSOME_ALT_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.ALT_contigs.bed' type="path" />
 
     <!--BCFtools options fo the variant calling-->
     <cvalue name='MPILEUP_OPTS' value='"-EI -q 30 -O u --ignore-RG --ff UNMAP,SECONDARY,QCFAIL,DUP,SUPPLEMENTARY -d 9999 -a AD -x"' type="string"/>
@@ -59,7 +59,7 @@
 
     <!--SNV reliability bundle-->
     <configurationValueBundle name='PIPE_CONFIG:SNV_RELIABILITY'>
-     	<cvalue name='MAPABILITY' value='${hg38DatabasesDirectory}/mappability/GRCh38_Mappability_Align_100mer.bedGraph.gz' type="path"/>
+     	<cvalue name='MAPABILITY' value='${hg38BaseDirectory}/stats/mappability_m2e2/GRCh38_Mappability_Align_100mer_m2e2.ALT_HLA.bedGraph.gz' type="path"/>
      	<cvalue name='SIMPLE_TANDEMREPEATS' value='${hg38DatabaseUCSCDirectory}/simple_tandem_repeats/2019-08-15_SimpleTandemRepeats_Human_hg38.sorted_plain.bed.gz:4' type="path"/>
      	<cvalue name='REPEAT_MASKER' value='${hg38DatabaseUCSCDirectory}/repeatMasker/2019-08-14_RepeatMasker_Human_hg38.sorted_plain.bed.gz' type="path"/>
     </configurationValueBundle>
@@ -75,4 +75,4 @@
       <cvalue name='ENCODE_TFBS' value='${hg38DatabaseUCSCDirectory}/TF_clusters/2019-08-15_Encode_TF_clusters_Human_hg38.sorted_plain.bed.gz' type="path"/>
     </configurationValueBundle>
   </configurationvalues>
-</configuration>
\ No newline at end of file
+</configuration>

From 01567ac14d9c977dd33904737b5b4921cf764f5d Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Fri, 16 Sep 2022 15:07:41 +0200
Subject: [PATCH 31/49] Fix the diagnostic plots for GRCh38

---
 resources/analysisTools/snvPipeline/filter_vcf.sh      | 2 +-
 resources/analysisTools/snvPipeline/snvsPerChromPlot.r | 1 +
 resources/configurationFiles/analysisSNVCalling.xml    | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh
index 0e8f226..2ae1534 100755
--- a/resources/analysisTools/snvPipeline/filter_vcf.sh
+++ b/resources/analysisTools/snvPipeline/filter_vcf.sh
@@ -24,7 +24,7 @@ getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFI
 ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}`
 
 # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail...
-declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES}"
+declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES_PLOTTING}"
 numberOfChromosomes=${CHROMOSOME_INDICES[@]}
 outputFilenamePrefix=${mpileupDirectory}/${SNVFILE_PREFIX}${PID}
 outputFilenamePrefix_original=${outputFilenamePrefix}
diff --git a/resources/analysisTools/snvPipeline/snvsPerChromPlot.r b/resources/analysisTools/snvPipeline/snvsPerChromPlot.r
index 3e6c4cb..b4f3627 100755
--- a/resources/analysisTools/snvPipeline/snvsPerChromPlot.r
+++ b/resources/analysisTools/snvPipeline/snvsPerChromPlot.r
@@ -48,6 +48,7 @@ dat$chromosome <- factor(dat$chromosome, levels = paste0("chr",c(seq(1,22),"X","
 
 
 chromLength = read.table(file = opt$chromLengthFile, header = F)
+chromLength$V1 = gsub("chr", "", chromLength$V1)
 rownames(chromLength) = paste0("chr",chromLength$V1)
 chromLength = chromLength[,2, drop= F]
 chromLengthMB = round(chromLength/1000000)
diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 8ad7427..620a8ad 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -127,6 +127,8 @@
         <cvalue name="MEDIAN_FILTER_THRESHOLD" value='20' type="integer" description="SNVs with median of alternative base qualities below this threshold will be removed in the rerun filter step. Value of -1 means no filtering."/>
         <cvalue name="RERUN_SUFFIX" value="_filteredAltMedian${MEDIAN_FILTER_THRESHOLD}" type="string"/>
 
+        <!--CHROMSOME indices for plotting to ignore HLA and ALT contigs-->
+        <cvalue name="CHROMOSOME_INDICES_PLOTTING" value="( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y )" type="bashArray" />
 
         <configurationValueBundle name='PIPE_CONFIG:SNV_RELIABILITY'>
             <cvalue name='MAPABILITY' value='${hg19DatabaseUCSCDirectory}/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz' type="path"/>

From 6321e0ac5634e3256b000af6efcd6e031003b73e Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Fri, 16 Sep 2022 15:11:11 +0200
Subject: [PATCH 32/49] Remove the quote for RAW_SNV_FILTER_OPTIONS

---
 resources/analysisTools/snvPipeline/snvCalling.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh
index d931226..fc7d668 100755
--- a/resources/analysisTools/snvPipeline/snvCalling.sh
+++ b/resources/analysisTools/snvPipeline/snvCalling.sh
@@ -106,7 +106,7 @@ firstLineVCF=`cat $filenameMPileupTemp | grep -v "^#" | head -n1`
 if [[ -n "$firstLineVCF" ]]; then
     # filter out mutations with strand bias next to the motif GGnnGG (or CCnnCC if snv is on reverse strand)
     ${PERL_BINARY} "$TOOL_SEQ_CONTEXT_ANNOTATOR" "$FASTAFROMBED_BINARY" "$filenameMPileupTemp" "$REFERENCE_GENOME" 10 "$TOOLS_DIR" | \
-        ${PYTHON_BINARY} "$TOOL_RAW_SNV_FILTER" --outf="$snvOut" "$RAW_SNV_FILTER_OPTIONS" # --inf=$MPILEUP_SUBDIR/forStrandBiasFilter.${chr}.bed
+        ${PYTHON_BINARY} "$TOOL_RAW_SNV_FILTER" --outf="$snvOut" ${RAW_SNV_FILTER_OPTIONS} # --inf=$MPILEUP_SUBDIR/forStrandBiasFilter.${chr}.bed
 
     if [[ "$?" == 0 ]]
     then

From 0adf1e4527aae48c04f7200d3f4173099a6a40fc Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Thu, 6 Oct 2022 09:17:28 +0200
Subject: [PATCH 33/49] Upgrade to gencodev39 for hg38

---
 resources/configurationFiles/analysisSNVCallingGRCh38.xml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
index 1bb222f..47bb543 100644
--- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -48,10 +48,10 @@
     <cvalue name="LOCALCONTROL_WES" value="${hg38DatabasesDirectory}/LocalControls/ExclusionList_2019_liftover/MPsnvs_PLindels/GRCh38_ExclusionList_2019_H021_MP_PL_WES.SNVs.AFgt1.vcf.gz" type="path" />
     <cvalue name="LOCALCONTROL_WES_COL" value="LocalControlAF_WES" type="string"/>
 
-    <cvalue name='ANNOVAR_BINARY' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/annotate_variation.pl' type='path'/>
+    <cvalue name='ANNOVAR_BINARY' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_Sept2022/annotate_variation.pl' type='path'/>
     <cvalue name='ANNOVAR_BUILDVER' value='hg38' type='string'/>
-    <cvalue name='ANNOVAR_DBPATH' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_April2018/humandb/' type='path'/>
-    <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV31'" type='string'/>
+    <cvalue name='ANNOVAR_DBPATH' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_Sept2022/humandb/' type='path'/>
+    <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV39'" type='string'/>
 
     <!--Reference annotations during confidene annotation-->
     <cvalue name="CONFIDENCE_OPTS" value="-t 500 --refgenome GRCh38 ftp://ftp.sanger.ac.uk/pub/cancer/dockstore/human/GRCh38_hla_decoy_ebv/core_ref_GRCh38_hla_decoy_ebv.tar.gz" type="string"

From c4fb48b0a2d0de64b9edfeb2e142fb2dcbf9e97e Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 7 Nov 2022 08:55:00 +0100
Subject: [PATCH 34/49] Bug fix with dbSNP counter

---
 resources/analysisTools/snvPipeline/in_dbSNPcounter.pl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
index 0c0b93d..c97155b 100755
--- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
+++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
@@ -80,7 +80,12 @@
 			$DBSBP = $col{"DBSNP"};
 			$KG = $col{"1K_GENOMES"};
 			$CONFIDENCE = $col{"CONFIDENCE"};
-			$CLASSIFICATION = $col{"RECLASSIFICATION"};
+
+			if ($col{"ANNOTATION_control"} > 0) {
+				$CLASSIFICATION = $col{"ANNOTATION_control"};
+			} else {
+				$CLASSIFICATION = $col{"RECLASSIFICATION"};
+			}
 		}
 	}
 	else{

From dc2ca36ee3223c965876aede00c18423461abdd7 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 7 Nov 2022 08:55:44 +0100
Subject: [PATCH 35/49] Update WGS local control

---
 resources/configurationFiles/analysisSNVCallingGRCh38.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
index 47bb543..58c5503 100644
--- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -43,7 +43,7 @@
     <cvalue name="GNOMAD_WES_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v2.1.1/exomes_liftover_grch38/gnomad.exomes.r2.1.1.sites.liftover_grch38.SNV.vcf.gz" type="path"/>
     <cvalue name="GNOMAD_WGS_COL" value="GNOMAD_GENOMES" type="string"/>
     <cvalue name="GNOMAD_WGS_ALL_SNV" value= "${hg38DatabasesDirectory}/gnomAD/v3/genomes/gnomad.genomes.r3.0.sites.SNV.vcf.gz" type="path"/>
-    <cvalue name="LOCALCONTROL_WGS" value="${hg38DatabasesDirectory}/LocalControls/ExclusionList_2019_liftover/MPsnvs_PLindels/GRCh38_ExclusionList_2019_HIPO-PCAWG_MP_PL_WGS.SNVs.AFgt1.vcf.gz" type="path" />
+    <cvalue name="LOCALCONTROL_WGS" value="${hg38DatabasesDirectory}/LocalControls/ExclusionList_2022/MPsnvs_PLindels/hg38_medullo_bcftools_mpileup_dev_hg38_alt_calling_AF0.05.vcf.gz" type="path" />
     <cvalue name="LOCALCONTROL_WGS_COL" value="LocalControlAF_WGS" type="string"/>
     <cvalue name="LOCALCONTROL_WES" value="${hg38DatabasesDirectory}/LocalControls/ExclusionList_2019_liftover/MPsnvs_PLindels/GRCh38_ExclusionList_2019_H021_MP_PL_WES.SNVs.AFgt1.vcf.gz" type="path" />
     <cvalue name="LOCALCONTROL_WES_COL" value="LocalControlAF_WES" type="string"/>

From 973ae9c57fa235fac12b9145f5f67a637ab22cc6 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 7 Nov 2022 09:03:51 +0100
Subject: [PATCH 36/49] hg38: Add local control and gnomAD based confidence
 annotation

---
 .../snvPipeline/confidenceAnnotation_SNVs.py  | 46 +++++++++++++------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index d1089cf..5663dd2 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -83,6 +83,7 @@ def main(args):
                   '##FILTER=<ID=YALT,Description="Variant on Y chromosome with low allele frequency">\n' \
                   '##FILTER=<ID=VAF,Description="Variant allele frequency in tumor < ' + str(args.newpun) + ' times allele frequency in control">\n' \
                   '##FILTER=<ID=BI,Description="Bias towards a PCR strand or sequencing strand">\n' \
+                  '##FILTER=<ID=FREQ,Description="High frequency in GnomAD(>0.1%) or in local control database (>0.05%)">\n' \
                   '##SAMPLE=<ID=CONTROL,SampleName=control_' + args.pid + ',Individual=' + args.pid + ',Description="Control">\n' \
                   '##SAMPLE=<ID=TUMOR,SampleName=tumor_'+args.pid+',Individual='+args.pid+',Description="Tumor">\n'\
                   '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
@@ -119,12 +120,12 @@ def main(args):
 
             variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", }
 
-            if args.no_control:
+            if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP:
                 variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$"
                 variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$"
                 variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$"
                 variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$"
-            else:
+            if not args.no_control:
                 fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ]
 
             header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers)
@@ -201,13 +202,15 @@ def main(args):
         is_weird = False # coindicende with known artefact-producing regions
         if args.no_control:
             classification = "somatic" # start with default somatic
+        else:
+            # for potential re-classification (e.g. low coverage in control and in dbSNP => probably germline)
+            classification = help["ANNOTATION_control"] # start with original classification
+
+        if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP:
             inGnomAD_WES = False
             inGnomAD_WGS = False
             inLocalControl_WES = False
             inLocalControl_WGS = False
-        else:
-            # for potential re-classification (e.g. low coverage in control and in dbSNP => probably germline)
-            classification = help["ANNOTATION_control"] # start with original classification
 
         ### For pancancer
         # genotype tumor as originally from mpileup
@@ -244,26 +247,27 @@ def main(args):
             if "COMMON=1" in help["DBSNP_COL"]:
                 is_commonSNP = True
 
-        if args.no_control:
+        if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP:
             if indbSNP and is_commonSNP and not is_clinic:
-                reasons += "dbSNP(NoControl)"
+                #reasons += "dbSNP(NoControl)"
+                pass
             if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))):
                 inGnomAD_WES = True
                 infofield["gnomAD_Exomes"] = "gnomAD_Exomes"
-                reasons += "gnomAD_Exomes(NoControl)"
+                #reasons += "gnomAD_Exomes(NoControl)"
             if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))):
                 inGnomAD_WGS = True
                 infofield["gnomAD_Genomes"] = "gnomAD_Genomes"
-                reasons += "gnomAD_Genomes(NoControl)"
+                #reasons += "gnomAD_Genomes(NoControl)"
 
             if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))):
                 inLocalControl_WGS = True
                 infofield["LocalControl_WGS"] = "LocalControl_WGS"
-                reasons += "LocalControl_WGS(NoControl)"
+                #reasons += "LocalControl_WGS(NoControl)"
             if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))):
                 inLocalControl_WES = True
                 infofield["LocalControl_WES"] = "LocalControl_WES"
-                reasons += "LocalControl_WES(NoControl)"
+                #reasons += "LocalControl_WES(NoControl)"
 
 
         # Punish for biases round 1
@@ -312,7 +316,7 @@ def main(args):
                 filterfield["BI"] = 1
 
         # Only for hg19 reference genome
-        if args.refgenome[0] == "hs37d5":
+        if args.refgenome[0] == "hs37d5" and not args.skipREMAP:
             # 2) annotations of regions that cause problems: some classes of repeats from RepeatMasker track,
             # segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability
             # simple repeats and low complexity (not the same as homopolymer, but similar enough);
@@ -397,6 +401,17 @@ def main(args):
 
                     filterfield["MAP"] = 1
                     reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce)
+        # Nov 2022: If the variant is present in the gnomAD or local control WGS
+        # the confidence are reduced and thrown out. This is implemented for hg38 and could be
+        # used with skipREMAP option for hg19.
+        # inLocalControl_WES: Needs to be generated from a new hg38 dataset
+        filterfield["FREQ"] = 0
+        if(args.refgenome[0] == 'GRCh38' or args.skipREMAP):
+            if(inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS):
+                reasons += 'commonSNP_or_technicalArtifact(-3)'
+                classification = "SNP_support_germline"
+                confidence -= 3
+                filterfield["FREQ"] = 1
 
         # if others have found the SNP already, it may be interesting despite low score
         # - but only if it's not a weird region.
@@ -590,7 +605,9 @@ def main(args):
                         reasons += "Germline_ALT<0.3(-2)"
                         filterfield["FRC"] = 1
                 if in1KG or (indbSNP and not (is_precious or is_clinic)): # but this supports again - number of reads may be low!
-                    classification += "_SNP_support"
+                    if filterfield['FREQ'] == 0:
+                        classification += "_SNP_support"
+
                 if depthC <= 10: # very probably germline due to skewed distribution at low coverage
                     classification += "_lowCov"	# => can end up as "germline_SNP_support_lowCov"
 
@@ -684,7 +701,7 @@ def main(args):
             filters_line = [] if entries[6] == "" else entries[6].split(';')
             if args.pancanout is not None:
                 filters_pancan = []
-            for filter in ("RE","BL","DP","SB","TAC","dbSNP","DB","HSDEPTH","MAP","SBAF","FRQ","TAR","UNCLEAR","DPHIGH","DPLOWC","1PS","ALTC","ALTCFR","FRC","YALT","VAF","BI"):
+            for filter in ("RE","BL","DP","SB","TAC","dbSNP","DB","HSDEPTH","MAP","SBAF","FRQ","TAR","UNCLEAR","DPHIGH","DPLOWC","1PS","ALTC","ALTCFR","FRC","YALT","VAF","BI", "FREQ"):
                 if filterfield.get(filter, 0) == 1:
                     if args.pancanout is not None:
                         filters_pancan.append(filter)
@@ -801,6 +818,7 @@ def main(args):
     parser.add_argument("-x", "--runexome", dest="runexome", action="store_true", default=False,
                         help="Run on exome, will turn off the high control coverage punishment " \
                              "and the PCR bias filter.")
+    parser.add_argument('--skipREMAP', dest='skipREMAP', action='store_true', default=False)
     parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", help="Max gnomAD WGS MAF", default=0.001, type=float)
     parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", help="Max gnomAD WES MAF", default=0.001, type=float)
     parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", help="Max local control WGS MAF", default=0.01, type=float)

From 56aa726ec3e7d097b9e57edf32aa39d8b747ef80 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 7 Nov 2022 13:48:35 +0100
Subject: [PATCH 37/49] Update README

---
 README.md        | 11 +++++++++++
 buildinfo.txt    |  2 +-
 buildversion.txt |  2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 48141d3..5d28343 100644
--- a/README.md
+++ b/README.md
@@ -131,6 +131,17 @@ The optional configuration JSON file defaults to the `convertToStdVCF.json` resi
 
 ## Changelog
 
+* 3.0.0
+  * Major
+    * Support for hg38/GRCh38 reference genome and variant calling from ALT and HLA contigs.
+  * Minor
+    * For hg38: Removing mappability and repeat elements' annotations from penalty calculations.
+    * skipREMAP: Option to remove repeat elements and mappability from confidence annotations in hg19.
+    * Removing EVS And ExAC AF from the annotations and no-control workflow filtering
+    * Support for variant calling from CRAM files
+    * Bug fix: Removing "quote" around the raw filter option `<RAW_SNV_FILTER_OPTIONS>`
+    * Update `COWorkflowsBasePlugin` to `1.4.2`
+
 * 2.2.0
 
   * minor: Added `vcfConvertToStd.py`; slightly modified from code by @juleskers and Sophia Stahl
diff --git a/buildinfo.txt b/buildinfo.txt
index 7730d00..df81708 100755
--- a/buildinfo.txt
+++ b/buildinfo.txt
@@ -1,4 +1,4 @@
-dependson=COWorkflowsBasePlugin:1.3.0
+dependson=COWorkflowsBasePlugin:1.4.2
 JDKVersion=1.8
 GroovyVersion=2.4
 RoddyAPIVersion=3.4
diff --git a/buildversion.txt b/buildversion.txt
index 228b97b..fd8c949 100755
--- a/buildversion.txt
+++ b/buildversion.txt
@@ -1,2 +1,2 @@
-2.1
+3.0
 0

From da45d2a91782ebca913879debfbeab43d4edc94e Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 7 Nov 2022 13:50:19 +0100
Subject: [PATCH 38/49] Exempt classification with FREQ

---
 .../analysisTools/snvPipeline/confidenceAnnotation_SNVs.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index 5663dd2..1d32592 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -409,7 +409,7 @@ def main(args):
         if(args.refgenome[0] == 'GRCh38' or args.skipREMAP):
             if(inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS):
                 reasons += 'commonSNP_or_technicalArtifact(-3)'
-                classification = "SNP_support_germline"
+                #classification = "SNP_support_germline"
                 confidence -= 3
                 filterfield["FREQ"] = 1
 

From 45fa141ae7800a7873a0b71be482350bae6550a6 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 20 Mar 2023 09:05:41 +0100
Subject: [PATCH 39/49] Reverting SNP based confidence scoring

---
 .../analysisTools/snvPipeline/confidenceAnnotation_SNVs.py    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index 1d32592..1cf7c1a 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -408,9 +408,9 @@ def main(args):
         filterfield["FREQ"] = 0
         if(args.refgenome[0] == 'GRCh38' or args.skipREMAP):
             if(inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS):
-                reasons += 'commonSNP_or_technicalArtifact(-3)'
+                #reasons += 'commonSNP_or_technicalArtifact(-3)'
                 #classification = "SNP_support_germline"
-                confidence -= 3
+                #confidence -= 3
                 filterfield["FREQ"] = 1
 
         # if others have found the SNP already, it may be interesting despite low score

From 5479faf3d69036c9c2aad9a03a26db1a116cca27 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 20 Mar 2023 09:06:54 +0100
Subject: [PATCH 40/49] Add exception for 'N' in createErrorPlots.py

---
 resources/analysisTools/snvPipeline/createErrorPlots.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/resources/analysisTools/snvPipeline/createErrorPlots.py b/resources/analysisTools/snvPipeline/createErrorPlots.py
index dc7dbfc..902eed2 100755
--- a/resources/analysisTools/snvPipeline/createErrorPlots.py
+++ b/resources/analysisTools/snvPipeline/createErrorPlots.py
@@ -199,6 +199,10 @@ def calculateErrorMatrix(vcfFilename, referenceFilename, errorType):
 			# 23.05.2016 JB: Excluded multiallelic SNVs
 			if ',' in split_line[header.index("ALT")]: continue
 
+			# 21.02.2023 NP: Excluded SNVs with 'N' before or after "," in context
+			if 'N,' in split_line[header.index("SEQUENCE_CONTEXT")] or ',N' in split_line[header.index("SEQUENCE_CONTEXT")]:
+				continue
+
 			chrom = split_line[header.index("CHROM")]
 			pos = int(split_line[header.index("POS")])
 			context = "" 

From f633856df579de6627a47686c47cbb7c906574cb Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 20 Mar 2023 09:08:20 +0100
Subject: [PATCH 41/49] Remove NA values in quantile calculation

---
 .../snvPipeline/tripletBased_BQDistribution_plotter.R           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R b/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R
index d57fcb5..07b6bff 100755
--- a/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R
+++ b/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R
@@ -474,7 +474,7 @@ plot_BQD_to_pdf = function(PDF_OUTPUT_FILE, data.bq.triplet,
                 molten.alt$nBases=transitionSubset[molten.alt$sample,"nBQ.alt"]
                 molten.alt[,ReadPositionsQuantile_ColName] = sapply(transitionSubset[molten.alt$sample,"RP_string.alt"], function(positionsString) {
                   positions = as.integer(unlist(strsplit(positionsString, ",")))
-                  return (round(quantile(positions, ReadPositionQuantile)))
+                  return (round(quantile(positions, ReadPositionQuantile, na.rm = TRUE)))
                 })                
                 # molten.altReadPos = melt(baseScores.alt.counts.normalized.cumul, id.vars = "sample")
                 if  (whatToPlot == "BQD_sampleIndividual_ColoredByChromosome") {

From ea7154be10a87a614be904f4f9fcfecaf7de91f0 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 20 Mar 2023 09:08:49 +0100
Subject: [PATCH 42/49] Update reference

---
 .../configurationFiles/analysisSNVCallingGRCh38.xml  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
index 58c5503..4cdb729 100644
--- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -13,15 +13,15 @@
     <cvalue name='hg38BaseDirectory' value='/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38' type='path' />
 
     <!-- Reference genomes -->
-    <cvalue name='REFERENCE_GENOME' value="${hg38BaseDirectory}/sequence/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa" type="path"/>
-    <cvalue name='INDEX_PREFIX' value="${hg38BaseDirectory}/indexes/bwa/bwa07_GRCh38_decoy_ebv_phiX_alt_hla_chr/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa" type="path"/>
-    <cvalue name="CHROM_SIZES_FILE" value="${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLenOnlyACGT_realChromosomes.tsv" type="path" />
-    <cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLength.tsv' type="path" />
+    <cvalue name='REFERENCE_GENOME' value="${hg38BaseDirectory}/sequence/GRCh38_decoy_ebv_alt_hla_phiX.fa" type="path"/>
+    <cvalue name='INDEX_PREFIX' value="${hg38BaseDirectory}/indexes/bwa/bwa07_GRCh38_decoy_ebv_phiX_alt_hla_chr/GRCh38_decoy_ebv_alt_hla_phiX.fa" type="path"/>
+    <cvalue name="CHROM_SIZES_FILE" value="${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv" type="path" />
+    <cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.chrLength.tsv' type="path" />
 
     <!--Chr indices for the calling-->
     <cvalue name="CHROMOSOME_INDICES" value="( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y HLA ALT )" type="bashArray" />
-    <cvalue name="CHROMOSOME_HLA_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.HLA_contigs.bed' type="path" />
-    <cvalue name="CHROMOSOME_ALT_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.ALT_contigs.bed' type="path" />
+    <cvalue name="CHROMOSOME_HLA_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.HLA_contigs.bed' type="path" />
+    <cvalue name="CHROMOSOME_ALT_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.ALT_contigs.bed' type="path" />
 
     <!--BCFtools options fo the variant calling-->
     <cvalue name='MPILEUP_OPTS' value='"-EI -q 30 -O u --ignore-RG --ff UNMAP,SECONDARY,QCFAIL,DUP,SUPPLEMENTARY -d 9999 -a AD -x"' type="string"/>

From 3f8419cc3f5db1c1ecc20653c9d18820f9af467a Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 20 Mar 2023 09:10:42 +0100
Subject: [PATCH 43/49] Update raw_filter_punishment in accordance with
 RAW_SNV_FILTER_OPTIONS

---
 .../analysisTools/snvPipeline/confidenceAnnotation_SNVs.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index 1cf7c1a..5c1c5a1 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -653,7 +653,7 @@ def main(args):
 
         #To make sure that changes in the raw filter do not influence the final result we punish them with -3
         # TODO: JB: Why we use the second decimal of orignal value here?
-        if not args.runlowmaf and ((float(entries[5]) < 3 and (float("%.2f"%fr_var_tum) < 0.05 or (tvf + tvr < 5))) or (float(entries[5]) < 20 and ((tvf == 0 or tvr == 0) and (tvf + tvr <= 3)))):
+        if not args.runlowmaf and ((float(entries[5]) < 3 and (float("%.2f"%fr_var_tum) < 0.03 or (tvf + tvr < 3))) or (float(entries[5]) < 20 and ((tvf == 0 or tvr == 0) and (tvf + tvr <= 3)))):
             confidence -= 3
             reasons += "raw_filter_punishment(-3)"
             filterfield["FRQ"] = 1

From 13bdc5f5957299b30c84a54d775247d8afcf5386 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Thu, 23 Mar 2023 09:07:48 +0100
Subject: [PATCH 44/49] Move python env

---
 .../analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
index e1aeb9d..991202f 100755
--- a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
+++ b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
@@ -19,7 +19,7 @@ module load bedtools/"${BEDTOOLS_VERSION:?BEDTOOLS_VERSION undefined}"
 module load pypy/"${PYPY_VERSION:?PYPY_VERSION undefined}"
 
 set +ue
-source /dkfz/cluster/virtualenvs/paramasi/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate
+source /omics/groups/OE0246/shared/paramasi/compute_env/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate
 set -ue
 
 export BGZIP_BINARY=bgzip

From e2f7db944ebc79a4f49050ce396c373fd55fc5f4 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 12 Jun 2023 12:21:29 +0200
Subject: [PATCH 45/49] Update virtual env path

---
 .../dkfz_lsf_virtual_snv_env_packages.txt     | 19 +++++++++++++++++++
 .../environments/tbi-lsf-cluster.sh           |  2 +-
 .../configurationFiles/analysisSNVCalling.xml |  2 ++
 3 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt

diff --git a/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt b/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt
new file mode 100644
index 0000000..2399198
--- /dev/null
+++ b/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt
@@ -0,0 +1,19 @@
+Package         Version
+--------------- --------
+biopython       1.57
+Cython          0.27.3
+funcsigs        1.0.2
+matplotlib      1.4.3
+mock            2.0.0
+nose            1.3.7
+numpy           1.11.3
+pbr             3.1.1
+pip             20.3.4
+pyparsing       2.2.0
+pysam           0.16.0.1
+python-dateutil 2.6.1
+pytz            2017.3
+scipy           0.12.0
+setuptools      36.8.0
+six             1.11.0
+wheel           0.30.0
diff --git a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
index 991202f..1584d10 100755
--- a/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
+++ b/resources/analysisTools/snvPipeline/environments/tbi-lsf-cluster.sh
@@ -19,7 +19,7 @@ module load bedtools/"${BEDTOOLS_VERSION:?BEDTOOLS_VERSION undefined}"
 module load pypy/"${PYPY_VERSION:?PYPY_VERSION undefined}"
 
 set +ue
-source /omics/groups/OE0246/shared/paramasi/compute_env/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate
+source "${DKFZ_LSF_VIRTUAL_SNV_ENV}
 set -ue
 
 export BGZIP_BINARY=bgzip
diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 620a8ad..99c8019 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -166,6 +166,8 @@
             <cvalue name='ENCODE_TFBS' value='${hg19DatabaseUCSCDirectory}/Sept2013/UCSC_27Sept2013_wgEncodeRegTfbsClusteredV3.bed.gz' type="path"/>
         </configurationValueBundle>
         <!--#>PIPE_CONFIG-->
+        
+        <cvalue name='DKFZ_LSF_VIRTUAL_SNV_ENV' value='/omics/groups/OE0246/shared/paramasi/compute_env/python_2.7.9_SNVCalling_pysam_0.16.0.1/bin/activate' type='path'>
 
     </configurationvalues>
     <processingTools>

From 44614b6a3dbffe9f4b798a7340ee6c73e0f1d367 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 12 Jun 2023 12:22:34 +0200
Subject: [PATCH 46/49] XML comments to description

---
 .../snvPipeline/PurityReloaded.py             |  2 +-
 .../snvPipeline/createErrorPlots.py           |  2 +-
 .../analysisTools/snvPipeline/filter_vcf.sh   |  2 +-
 .../analysisSNVCallingGRCh38.xml              | 26 +++++++------------
 4 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/PurityReloaded.py b/resources/analysisTools/snvPipeline/PurityReloaded.py
index b8dcb7d..f67c907 100755
--- a/resources/analysisTools/snvPipeline/PurityReloaded.py
+++ b/resources/analysisTools/snvPipeline/PurityReloaded.py
@@ -251,7 +251,7 @@ def parseVcf(file,num):
 	while (l!= ""):
 		t=l.split('\t')
 		if (t[0][0] != "#") and isValid(t):
-			# Skiping the non-primary assembly variants from purity calculations
+			# Skipping the non-primary assembly variants from purity calculations
 			if t[0].startswith('HLA') or t[0].endswith('_alt'):
 				l=vcf.readline()
 				continue
diff --git a/resources/analysisTools/snvPipeline/createErrorPlots.py b/resources/analysisTools/snvPipeline/createErrorPlots.py
index 902eed2..a528de6 100755
--- a/resources/analysisTools/snvPipeline/createErrorPlots.py
+++ b/resources/analysisTools/snvPipeline/createErrorPlots.py
@@ -200,7 +200,7 @@ def calculateErrorMatrix(vcfFilename, referenceFilename, errorType):
 			if ',' in split_line[header.index("ALT")]: continue
 
 			# 21.02.2023 NP: Excluded SNVs with 'N' before or after "," in context
-			if 'N,' in split_line[header.index("SEQUENCE_CONTEXT")] or ',N' in split_line[header.index("SEQUENCE_CONTEXT")]:
+			if {'N,', ',N'}.intersection(split_line[header.index("SEQUENCE_CONTEXT")]):
 				continue
 
 			chrom = split_line[header.index("CHROM")]
diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh
index 2ae1534..fa23694 100755
--- a/resources/analysisTools/snvPipeline/filter_vcf.sh
+++ b/resources/analysisTools/snvPipeline/filter_vcf.sh
@@ -24,7 +24,7 @@ getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFI
 ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}`
 
 # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail...
-declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES_PLOTTING}"
+declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES_PLOTTING:-${CHROMOSOME_INDICES}}"
 numberOfChromosomes=${CHROMOSOME_INDICES[@]}
 outputFilenamePrefix=${mpileupDirectory}/${SNVFILE_PREFIX}${PID}
 outputFilenamePrefix_original=${outputFilenamePrefix}
diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
index 4cdb729..96d2dcf 100644
--- a/resources/configurationFiles/analysisSNVCallingGRCh38.xml
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -9,31 +9,26 @@
 
   <configurationvalues>
 
-    <!--hg38 base directory-->
-    <cvalue name='hg38BaseDirectory' value='/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38' type='path' />
+    <cvalue name='hg38BaseDirectory' value='/omics/odcf/reference_data/legacy/ngs_share/assemblies/hg_GRCh38' type='path' description="hg38 base directory"/>
 
-    <!-- Reference genomes -->
-    <cvalue name='REFERENCE_GENOME' value="${hg38BaseDirectory}/sequence/GRCh38_decoy_ebv_alt_hla_phiX.fa" type="path"/>
+    <cvalue name='REFERENCE_GENOME' value="${hg38BaseDirectory}/sequence/GRCh38_decoy_ebv_alt_hla_phiX.fa" type="path" description="Reference genomes"/>
     <cvalue name='INDEX_PREFIX' value="${hg38BaseDirectory}/indexes/bwa/bwa07_GRCh38_decoy_ebv_phiX_alt_hla_chr/GRCh38_decoy_ebv_alt_hla_phiX.fa" type="path"/>
     <cvalue name="CHROM_SIZES_FILE" value="${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.chrLenOnlyACGT_realChromosomes.tsv" type="path" />
     <cvalue name='CHROMOSOME_LENGTH_FILE' value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.chrLength.tsv' type="path" />
 
-    <!--Chr indices for the calling-->
-    <cvalue name="CHROMOSOME_INDICES" value="( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y HLA ALT )" type="bashArray" />
-    <cvalue name="CHROMOSOME_HLA_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.HLA_contigs.bed' type="path" />
-    <cvalue name="CHROMOSOME_ALT_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.ALT_contigs.bed' type="path" />
+    <cvalue name="CHROMOSOME_INDICES" value="( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y HLA ALT )" type="bashArray" description="Chr indices for the calling"/>
+    <cvalue name="CHROMOSOME_HLA_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.HLA_contigs.bed' type="path" description="HLA contig list"/>
+    <cvalue name="CHROMOSOME_ALT_CONTIGS_FILE" value='${hg38BaseDirectory}/stats/GRCh38_decoy_ebv_alt_hla_phiX.fa.ALT_contigs.bed' type="path" description="ALT contig list"/>
 
-    <!--BCFtools options fo the variant calling-->
-    <cvalue name='MPILEUP_OPTS' value='"-EI -q 30 -O u --ignore-RG --ff UNMAP,SECONDARY,QCFAIL,DUP,SUPPLEMENTARY -d 9999 -a AD -x"' type="string"/>
-    <cvalue name='MPILEUP_OPTS_ALT_HLA' value='"-A -EI -q 30 -O u --ignore-RG --ff UNMAP,SECONDARY,QCFAIL,DUP -d 9999 -a AD -x"' type="string"/>
-    <cvalue name='BCFTOOLS_OPTS' value='"-vc -p 2.0 --ploidy GRCh38"' type="string"/>
+    <cvalue name='MPILEUP_OPTS' value='"-EI -q 30 -O u --ignore-RG --ff UNMAP,SECONDARY,QCFAIL,DUP,SUPPLEMENTARY -d 9999 -a AD -x"' type="string" description="Mpileup options for primary chrs"/>
+    <cvalue name='MPILEUP_OPTS_ALT_HLA' value='"-A -EI -q 30 -O u --ignore-RG --ff UNMAP,SECONDARY,QCFAIL,DUP -d 9999 -a AD -x"' type="string" description="Mpileup options for ALT/HLA contigs"/>
+    <cvalue name='BCFTOOLS_OPTS' value='"-vc -p 2.0 --ploidy GRCh38"' type="string" description="BCFtools call options for GRCh38/hg38" />
 
     <cvalue name='CHR_PREFIX' value='chr' type='string' />
     <cvalue name="disableAutoBAMHeaderAnalysis" value="true" type="boolean"/>
 
-    <!--## Databases -->
-    <cvalue name='hg38DatabasesDirectory' value="${hg38BaseDirectory}/databases" type='path' />
-    <cvalue name='hg38DatabaseUCSCDirectory' value="${hg38DatabasesDirectory}/UCSC" type='path' />
+    <cvalue name='hg38DatabasesDirectory' value="${hg38BaseDirectory}/databases" type='path' description="hg38-based annotations database directory"/>
+    <cvalue name='hg38DatabaseUCSCDirectory' value="${hg38DatabasesDirectory}/UCSC" type='path' description="hg38-based UCSC annotation directory"/>
 
     <cvalue name="DBSNP" value="${hg38DatabasesDirectory}/dbSNP/dbSNP_151_GRCh38.p7/00-All_v151_GRCh38.SNV.vcf.gz" type="path"/>
     <cvalue name="DBSNP_COL" value="DBSNP" type="string"/>
@@ -53,7 +48,6 @@
     <cvalue name='ANNOVAR_DBPATH' value='${hg38DatabasesDirectory}/ANNOVAR/annovar_Sept2022/humandb/' type='path'/>
     <cvalue name='ANNOVAR_DBTYPE' value="'-dbtype wgEncodeGencodeCompV39'" type='string'/>
 
-    <!--Reference annotations during confidene annotation-->
     <cvalue name="CONFIDENCE_OPTS" value="-t 500 --refgenome GRCh38 ftp://ftp.sanger.ac.uk/pub/cancer/dockstore/human/GRCh38_hla_decoy_ebv/core_ref_GRCh38_hla_decoy_ebv.tar.gz" type="string"
             description="max read depth in control (-t INT default 150 for genomes, set to 500 for exomes) can now be ommited to avoid depth dependent punishing by setting -c 0. It is also possible to get a justification for the confidence score by setting the -p 1 flag"/>
 

From 3a47349b601ab9b51f5f821fd4e7829e032b86f7 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 12 Jun 2023 12:55:00 +0200
Subject: [PATCH 47/49] Minor update

---
 README.md                                     |  2 +-
 .../dkfz_lsf_virtual_snv_env_packages.txt     | 19 -------------------
 .../configurationFiles/analysisSNVCalling.xml |  2 +-
 3 files changed, 2 insertions(+), 21 deletions(-)
 delete mode 100644 resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt

diff --git a/README.md b/README.md
index 0fba5fa..a1b5301 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # DKFZ SNVCalling Workflow
 
-An SNV calling workflow developed in the Applied Bioinformatics and Theoretical Bioinformatics groups at the DKFZ. An earlier version (pre Github) of this workflow was used in the [Pancancer](https://dockstore.org/containers/quay.io/pancancer/pcawg-dkfz-workflow) project. The workflow is only suited for human data (hg37/hg19; some later versions also hg38), because of the big role annotations play in this workflow.
+An SNV calling workflow developed in the Applied Bioinformatics and Theoretical Bioinformatics groups at the DKFZ. An earlier version (pre Github) of this workflow was used in the [Pancancer](https://dockstore.org/containers/quay.io/pancancer/pcawg-dkfz-workflow) project. The workflow is only suited for human data (hg37/GRCh37/hg19; and also for hg38/GRCh38), because of the big role annotations play in this workflow.
 
 > <table><tr><td><a href="https://www.denbi.de/"><img src="docs/images/denbi.png" alt="de.NBI logo" width="300" align="left"></a></td><td><strong>Your opinion matters!</strong> The development of this workflow is supported by the <a href="https://www.denbi.de/">German Network for Bioinformatic Infrastructure (de.NBI)</a>. By completing <a href="https://www.surveymonkey.de/r/denbi-service?sc=hd-hub&tool=SNVCallingWorkflow">this very short (30-60 seconds) survey</a> you support our efforts to improve this tool.</td></tr></table>
 
diff --git a/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt b/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt
deleted file mode 100644
index 2399198..0000000
--- a/resources/analysisTools/snvPipeline/environments/dkfz_lsf_virtual_snv_env_packages.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Package         Version
---------------- --------
-biopython       1.57
-Cython          0.27.3
-funcsigs        1.0.2
-matplotlib      1.4.3
-mock            2.0.0
-nose            1.3.7
-numpy           1.11.3
-pbr             3.1.1
-pip             20.3.4
-pyparsing       2.2.0
-pysam           0.16.0.1
-python-dateutil 2.6.1
-pytz            2017.3
-scipy           0.12.0
-setuptools      36.8.0
-six             1.11.0
-wheel           0.30.0
diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index e620f49..c337644 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
@@ -11,7 +11,7 @@
         <!-- The following options are only relevant for the ODCF cluster at the DKFZ -->
         <cvalue name='workflowEnvironmentScript' value="workflowEnvironment_tbiLsf" type='string'/>
         <cvalue name='snvAnnotationEnvironmentScript' value="snvAnnotationEnvironment_tbiLsf" type="string"/>
-        <cvalue name='tbiLsfVirtualEnvDir' value="/omics/groups/OE0246/shared/paramasi/compute_env/python_2.7.9_SNVCalling_pysam_0.16.0.1" type="path"
+        <cvalue name='tbiLsfVirtualEnvDir' value="" type="path"
                 description="The path on the execution hosts to the virtualenv used in the tbi-lsf-cluster."/>
 
         <cvalue name='RSCRIPT_VERSION' value='3.3.1' type="string"/>

From b8d46649c39587b005d6b76bda3bd061ea4e4f89 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Mon, 12 Jun 2023 13:50:29 +0200
Subject: [PATCH 48/49] Update readme with hg38 calls

---
 README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/README.md b/README.md
index a1b5301..a94901d 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,8 @@ Please have a look at the `resources/configurationFiles/analysisSNVCalling.xml`
 
 ## Example Call
 
+For hg19,
+
 ```bash
 roddy.sh run projectConfigurationName@analysisName patientId \
   --useconfig=/path/to/your/applicationProperties.ini \
@@ -123,6 +125,22 @@ roddy.sh run projectConfigurationName@analysisName patientId \
   --cvalues="bamfile_list:/path/to/your/control.bam;/path/to/your/tumor.bam,sample_list:normal;tumor,possibleTumorSampleNamePrefixes:tumor,possibleControlSampleNamePrefixes:normal,REFERENCE_GENOME:/reference/data/hs37d5_PhiX.fa,CHROMOSOME_LENGTH_FILE:/reference/data/hs37d5_PhiX.chromSizes,extractSamplesFromOutputFiles:false"
 ```
 
+For hg38, add the following workflow configs to the `projectConfigs`
+
+```xml
+  <availableAnalyses>
+    <analysis id="snvCalling" configuration="snvCallingAnalysisGRCh38" useplugin="SNVCallingWorkflow:3.0.0"/>
+  </availableAnalyses>
+```
+
+```bash
+roddy.sh run projectConfigurationName@analysisName patientId \
+  --useconfig=/path/to/your/applicationProperties.ini \
+  --configurationDirectories=/path/to/your/projectConfigs \
+  --useiodir=/input/directory,/output/directory/snv \
+  --cvalues="bamfile_list:/path/to/your/control.bam;/path/to/your/tumor.bam,sample_list:normal;tumor,possibleTumorSampleNamePrefixes:tumor,possibleControlSampleNamePrefixes:normal,REFERENCE_GENOME:/reference/data/GRCh38_decoy_ALT_HLA_PhiX.fa,CHROMOSOME_LENGTH_FILE:/reference/data/GRCh38_decoy_ALT_HLA_PhiX.chromSizes,extractSamplesFromOutputFiles:false"
+```
+
 ### No Control
 
 * Set the parameter `isNoControlWorkflow` to `true`.

From 4dca6367a9dfa87ec9d1f464037d353b93b1d760 Mon Sep 17 00:00:00 2001
From: paramasi <n.paramasivam@dkfz.de>
Date: Tue, 27 Jun 2023 16:33:54 +0200
Subject: [PATCH 49/49] Fixing the missed conflict

---
 .../snvPipeline/filter_PEoverlap.py           | 476 ------------------
 1 file changed, 476 deletions(-)

diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
index 14e016a..6d19fcf 100755
--- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py
+++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py
@@ -1,4 +1,3 @@
-<<<<<<< HEAD
 #!/usr/bin/env python
 #
 # Copyright (c) 2018 German Cancer Research Center (DKFZ).
@@ -477,478 +476,3 @@ def performAnalysis(args):
     performAnalysis(args)
     #print "\nProgram successfully terminating...."
 
-=======
-#!/usr/bin/env python
-#
-# Copyright (c) 2018 German Cancer Research Center (DKFZ).
-#
-# Distributed under the MIT License (https://opensource.org/licenses/MIT).
-#
-
-
-# python /home/jaegern/pyWorkspace/NGS_Read_Processing/src/filter_PEoverlap.py --inf=snvs_108031.vcf --alignmentFile=/icgc/lsdf/mb/analysis/medullo/adultMB/results_per_pid/108031/alignment/tumor_108031_merged.bam.rmdup.bam --outf=snvs_108031_PEoverlapFiltered.vcf
-# more snvs_108031.vcf | python /home/jaegern/pyWorkspace/NGS_Read_Processing/src/filter_PEoverlap.py --alignmentFile=/icgc/lsdf/mb/analysis/medullo/adultMB/results_per_pid/108031/alignment/tumor_108031_merged.bam.rmdup.bam --outf=snvs_108031_PEoverlapFiltered_nonALT_FINAL.vcf
-
-
-import platform
-if platform.python_implementation() == "PyPy":
-    import copysam as pysam
-else:  # "CPython"
-    import pysam
-
-import sys, os
-from vcfparser import *
-import re
-
-def listToTabsep(listItems, sep='\t'):
-    return sep.join(listItems)
-
-def qualFromASCII(ch):
-    return(ord(ch) - qualScoreOffset)
-
-
-def transformQualStr(s):
-    return map(qualFromASCII,s)
-
-# Routine for getting index of ACGTNacgtn lists
-def getIndexACGTNacgtn(is_reverse, is_read1, base):
-    if (is_reverse):
-        if(is_read1):
-            if(base == "a"):
-                return ["minus", 5]
-            elif(base == "c"):
-                return ["minus", 6]
-            elif(base == "g"):
-                return ["minus", 7]
-            elif(base == "t"):
-                return ["minus", 8]
-            elif(base == "n"):
-                return ["minus", 9]
-        else:
-            if(base == "a"):
-                return ["plus", 5]
-            elif(base == "c"):
-                return ["plus", 6]
-            elif(base == "g"):
-                return ["plus", 7]
-            elif(base == "t"):
-                return ["plus", 8]
-            elif(base == "n"):
-                return ["plus", 9]
-    else:
-        if(is_read1):
-            if(base == "a"):
-                return ["plus", 0]
-            elif(base == "c"):
-                return ["plus", 1]
-            elif(base == "g"):
-                return ["plus", 2]
-            elif(base == "t"):
-                return ["plus", 3]
-            elif(base == "n"):
-                return ["plus", 4]
-        else:
-            if(base == "a"):
-                return ["minus", 0]
-            elif(base == "c"):
-                return ["minus", 1]
-            elif(base == "g"):
-                return ["minus", 2]
-            elif(base == "t"):
-                return ["minus", 3]
-            elif(base == "n"):
-                return ["minus", 4]
-
-def decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar):
-    if remove_base.lower() == REF.lower():
-        if remove_is_reverse:
-            # check if none of the 4 DP4 values are < 0 now, which can happen due to BAQ values instead of original base qualities, which are not part of the BAM file
-            if DP4rr > 0: DP4rr -= 1
-        else:
-            if DP4rf > 0: DP4rf -= 1
-    elif remove_base.lower() == ALT.lower():
-        if remove_is_reverse:
-            if DP4ar > 0: DP4ar -= 1
-        else:
-            if DP4af > 0: DP4af -= 1   
-    return(DP4rf, DP4rr, DP4af, DP4ar)
-
-class BoolCounter:
-    """ A class for counter objects """ 
-
-    def __init__(self, ref_base, alt_base):
-        self.ref_base = ref_base
-        self.alt_base = alt_base
-        self.ref_count = 0
-        self.alt_count = 0
-
-    def update(self, current_base):
-        if self.ref_base == current_base:
-            self.ref_count += 1
-        elif self.alt_base == current_base:
-            self.alt_count += 1
-
-    def update_nonREFnonALT(self, count):
-        self.alt_count =+ count
-
-    def counted(self):
-        return [self.ref_count, self.alt_count]
-
-# MAIN ANALYSIS PROCEDURE
-def performAnalysis(args):
-    global qualScoreOffset
-
-    # http://stackoverflow.com/questions/881696/unbuffered-stdout-in-python-as-in-python-u-from-within-the-program
-    unbuffered = os.fdopen(sys.stdout.fileno(), 'w', 0)
-    sys.stdout = unbuffered
-
-    if args.qualityScore == 'illumina': qualScoreOffset = 64
-    elif args.qualityScore == 'phred': qualScoreOffset = 33
-
-    #vcfInFile = open(args.inf, "r")
-    #outFile = open(args.outf, "w")
-
-    # Reference file for BAQ_recalcuation and local realignment
-    reference_file = pysam.Fastafile(args.refFileName)
-
-    mode = "r"
-    multiple_iterators = False
-    # Setting pysam read mode based on the file extension
-    if args.alignmentFile.split(".")[-1] == "bam":
-        mode += "b"
-    elif args.alignmentFile.split(".")[-1] == "cram":
-        mode += "c"
-    samfile = pysam.Samfile(args.alignmentFile, mode)  # This should work for BAM file only (with random access).
-
-    if args.altPosF != '':
-        ALT_basePositions_file = args.altPosF
-
-    if args.refPosF != '':
-        REF_basePositions_file = args.refPosF
-
-    if args.altBQF != '':
-        ALT_baseQualities_file = args.altBQF
-
-    if args.refBQF != '':
-        REF_baseQualities_file = args.refBQF
-
-
-    for line in sys.stdin:  #   vcfInFile
-        if line[0] == "#":
-            headers = list(line[1:].rstrip().split('\t'))
-            fixed_headers = ["^CHROM$", "^POS$", "^REF$", "^ALT$", "^INFO$", ]
-            if args.no_control:
-                fixed_headers += ["^CONFIDENCE$", "^RECLASSIFICATION$", ]
-            else:
-                fixed_headers += ["^ANNOTATION_control", ]
-            header_indices = get_header_indices(headers, args.configfile, fixed_headers)
-            sys.stdout.write(line)
-            continue       # skip header from analysis
-
-        entries = line.strip().split('\t')
-        parsed_line = LineParser(entries, header_indices)
-
-        nonREFnonALTfwd=0
-        nonREFnonALTrev=0
-        ALTcount=0
-
-        ALT_basePositions=[]
-        REF_basePositions=[]
-        REF_baseQualities=[]
-        ALT_baseQualities=[]
-
-        # how to treat multiallelic SNVs? Skipped in this current version...                
-        if ((args.no_control and int(parsed_line["CONFIDENCE"]) > 7 and "somatic" in parsed_line["RECLASSIFICATION"]) or (not args.no_control and "somatic" in parsed_line["ANNOTATION_control"])) and len(parsed_line["ALT"]) == 1:
-            # DP=13;AF1=0.5;AC1=1;DP4=2,3,3,4;MQ=37;FQ=75;PV4=1,1,1,1
-            info_values = parsed_line["INFO"].split(';')
-            for info_idx, info_value in enumerate(info_values):
-                if info_value[:4] == "DP4=":
-                    DP4_idx = info_idx
-                    DP4 = map(int, info_value[4:].split(','))
-                    DP4rf, DP4rr, DP4af, DP4ar = DP4
-                    DP4_original = re.sub('DP4', 'DP4original', info_value) # Keeping a backup of original DP4
-                    DP4_original_alt = DP4af + DP4ar                    
-                    break
-
-            chrom=parsed_line["CHROM"]
-            pos=int(parsed_line["POS"])
-            REF=parsed_line["REF"]
-            ALT=parsed_line["ALT"]
-            
-            readNameHash={}
-            readMateHash={} # Hash to store read and mate starting positions for duplicate marking
-            readMateHash_qnameLocation={} # Hash to store the location of gname in the above hash list
-
-
-            ACGTNacgtn1 = [0]*10
-            ACGTNacgtn2 = [0]*10
-            count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection  
-            count_supple = BoolCounter(REF, ALT) # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions
-            count_mismatch = BoolCounter(REF, ALT) # " for mismatch report 
-            count_nonREFnonALT = BoolCounter(REF, ALT) # " to count the non-ref and non-alt base at POS
-
-            # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq).
-            for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators):
-                if pileupcolumn.pos == (pos-1):                	
-                    #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments)                    
-                    for pileupread in pileupcolumn.pileups:                    	
-                        if pileupread.is_del:                            
-                            # 31 May 2016 JB: deletion at the pileup position                            
-                            continue                        
-                        baseScore = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0]
-                        readpos = pileupread.query_position
-                        if pileupread.alignment.seq[pileupread.query_position].lower()  == ALT.lower():                            
-                            if args.altBQF != '':
-                                ALT_baseQualities.append(baseScore)
-                            if args.altPosF != '':
-                                if pileupread.alignment.is_reverse:
-                                    readlength = len(pileupread.alignment.seq)
-                                    readpos = (readlength - readpos)
-                                ALT_basePositions.append(readpos)
-                        if pileupread.alignment.seq[pileupread.query_position].lower()  == REF.lower():
-                            if args.refBQF != '':
-                                REF_baseQualities.append(baseScore)
-                            if args.refPosF != '':
-                                if pileupread.alignment.is_reverse:
-                                    readlength = len(pileupread.alignment.seq)
-                                    readpos = (readlength - readpos)
-                                REF_basePositions.append(readpos)
-
-                        if pileupread.alignment.mapq >= args.mapq:
-                            # http://wwwfgu.anat.ox.ac.uk/~andreas/documentation/samtools/api.html   USE qqual
-                            try:
-                                if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:                                    
-                                    # check if we consider this read as a proper read in terms of number of mismatches
-                                    if args.allowedNumberOfMismatches > -1:
-                                        numberOfMismatches = None
-                                        for tag in pileupread.alignment.tags:
-                                            if tag[0] == "NM":
-                                                numberOfMismatches = tag[1]
-                                                break 
-                                            else:
-                                                continue                                    
-                                        
-                                        if numberOfMismatches is not None:
-                                            if numberOfMismatches > args.allowedNumberOfMismatches:
-                                                remove_base = pileupread.alignment.seq[pileupread.query_position]
-                                                remove_is_reverse = pileupread.alignment.is_reverse
-                                                count_mismatch.update(remove_base)
-                                                (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)                                         
-                                                # after decreasing the respective DP4 value, go directly to the next read
-                                                # without remembering the current read
-                                                # This will lead to an unknown read name when the paired read occurs at the same
-                                                # position. As we have already discarded the current high-mismatch read, we do not
-                                                # have to decrease DP4 values again, when the read partner occurs at the same SNV.
-                                                # We also do not increase ANCGTNacgtn for the discarded read.
-                                                continue 
-                                                                            
-                                    # Check if pileupread.alignment is proper pair
-                                    if(pileupread.alignment.is_proper_pair):
-                                        # count to ACGTNacgtn list                                        
-                                        is_reverse = pileupread.alignment.is_reverse
-                                        is_read1 = pileupread.alignment.is_read1
-                                        base = pileupread.alignment.seq[pileupread.query_position].lower()
-                                        ACGTNacgtn_index = getIndexACGTNacgtn(is_reverse, is_read1, base)
-                                        if(ACGTNacgtn_index[0] == "plus"):
-                                            ACGTNacgtn1[ACGTNacgtn_index[1]] += 1
-                                        else:
-                                            ACGTNacgtn2[ACGTNacgtn_index[1]] += 1
- 
-                                        #if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:        # DEBUG July 23 2012: BROAD BAM problem due to pileupread.alignment.qqual being shorter sometimes than pileupread.alignment.qual
-                                        if(pileupread.alignment.query_name in readNameHash):
-                                            #print pileupread.alignment.query_name
-                                            old_qual = readNameHash[pileupread.alignment.query_name][0]
-                                            old_base = readNameHash[pileupread.alignment.query_name][1]
-                                            old_is_reverse = readNameHash[pileupread.alignment.query_name][2]
-                                            old_read_mate_tuple = readNameHash[pileupread.alignment.query_name][3]
-                                            current_qual = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0]
-                                            current_base = pileupread.alignment.seq[pileupread.query_position]
-                                            current_is_reverse = pileupread.alignment.is_reverse
-                                            current_read_mate_tuple = (pileupread.alignment.reference_id, pileupread.alignment.reference_start, pileupread.alignment.reference_end, pileupread.alignment.next_reference_id, pileupread.alignment.next_reference_start)
-                                            # if read name occurs twice for one variant, then due to overlapping PE reads, then subtract variant count from DP4 field
-                                            # if old_base is not equal to new_base remove the one with the smaller base quality
-                                            remove_base = None
-                                            remove_is_reverse = None
-                                            if(not(old_base == current_base)):
-                                                if(old_qual <= current_qual):
-                                                    remove_base = old_base
-                                                    remove_is_reverse = old_is_reverse
-                                                    remove_old = True
-                                                else:
-                                                    remove_base = current_base
-                                                    remove_is_reverse = current_is_reverse
-                                                    remove_old = False
-                                            else:
-                                                remove_base = current_base
-                                                remove_is_reverse = current_is_reverse
-                                                remove_old = False
-                                            
-                                            count_PE.update(remove_base)
-                                            (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)                                                                                        
-                                            # If current base is better, then removing the information about old mate
-                                            # If current base is not good, then do nothing
-                                            if remove_old:
-                                            	old_location = readMateHash_qnameLocation[pileupread.alignment.query_name]
-                                            	del readMateHash[old_read_mate_tuple][old_location]
-                                            	read_mate_tuple_value = (current_qual, (current_base, current_is_reverse, pileupread.alignment.query_name, pileupread.alignment.is_supplementary))
-                                            	if current_read_mate_tuple in readMateHash:
-                                            		readMateHash[current_read_mate_tuple].append(read_mate_tuple_value)
-                                            	else:
-                                            		readMateHash[current_read_mate_tuple] = []
-                                            		readMateHash[current_read_mate_tuple].append(read_mate_tuple_value)
-
-                                        else:
-                                            # Store base quality, base, and read direction in readNameHash
-                                            base_qual_score=transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0]
-                                            read_mate_tuple = (pileupread.alignment.reference_id, pileupread.alignment.reference_start, pileupread.alignment.reference_end, pileupread.alignment.next_reference_id, pileupread.alignment.next_reference_start)
-                                            read_mate_tuple_value = (base_qual_score, (pileupread.alignment.seq[pileupread.query_position], pileupread.alignment.is_reverse, pileupread.alignment.query_name, pileupread.alignment.is_supplementary))
-
-                                            readNameHash[pileupread.alignment.query_name] = [base_qual_score, pileupread.alignment.seq[pileupread.query_position], pileupread.alignment.is_reverse, read_mate_tuple]
-
-                                            if read_mate_tuple in readMateHash:
-                                            	readMateHash[read_mate_tuple].append(read_mate_tuple_value)
-                                            else:
-                                            	readMateHash[read_mate_tuple] = []
-                                            	readMateHash[read_mate_tuple].append(read_mate_tuple_value)
-
-                                            readMateHash_qnameLocation[pileupread.alignment.query_name] = len(readMateHash[read_mate_tuple]) - 1 # Location of the last pushed element in the array                                            
-
-                            except IndexError:
-                                "soft-clipped or trimmed base, not part of the high-qual alignemnt anyways, skip"
-
-                            if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:
-                            
-                                if pileupread.alignment.seq[pileupread.query_position] == ALT:
-                                    ALTcount += 1                                
-                                # samtools mpileup sometimes counts bases as variants which are neither REF nor ALT
-                                if (pileupread.alignment.seq[pileupread.query_position] != REF) and (pileupread.alignment.seq[pileupread.query_position] != ALT):                                    
-                                    if pileupread.alignment.is_reverse:                                    	
-                                        nonREFnonALTrev += 1
-                                        #if DP4ar > 0: DP4ar -= 1
-                                    else:
-                                        nonREFnonALTfwd += 1
-                                        #if DP4af > 0: DP4af -= 1
-
-                    if (len(REF_baseQualities) > 0):
-                        VAF =  1.0 * len(ALT_baseQualities) / (len(REF_baseQualities) + len(ALT_baseQualities))
-                    elif (len(ALT_baseQualities) > 0):
-                        VAF = 1.0
-                    else:
-                        VAF = 0.0
-
-                    if args.altBQF != '':
-                        scoreString = ",".join([str(score) for score in ALT_baseQualities])
-                        if scoreString != '':
-                            scoreString = ";".join([scoreString , str(VAF)])
-                            ALT_baseQualities_file.write("%s\t%s\t%s\n" % (chrom, pos, scoreString))
-
-                    if args.refBQF != '':
-                        scoreString = ",".join([str(score) for score in REF_baseQualities])
-                        if scoreString != '':
-                            scoreString = ";".join([scoreString , str(VAF)])
-                            REF_baseQualities_file.write("%s\t%s\t%s\n" % (chrom, pos, scoreString))
-
-                    if args.altPosF != '':
-                        positionsString = ",".join([str(readpos) for readpos in ALT_basePositions])
-                        if positionsString != '':
-                            ALT_basePositions_file.write("%s\t%s\t%s\n" % (chrom, pos, positionsString))
-
-                    if args.refPosF != '':
-                        positionsString = ",".join([str(readpos) for readpos in REF_basePositions])
-                        if positionsString != '':
-                            REF_basePositions_file.write("%s\t%s\t%s\n" % (chrom, pos, positionsString))
-
-                    break # only one pileup for a position
-
-            # Calculating duplicates based on read-mate pair's start positions (chr id and start location)
-            count_duplicate = BoolCounter(REF, ALT)
-
-            for key in readMateHash:
-            	value_length = len(readMateHash[key])            	            	
-            	if value_length > 0:            		
-            	    sorted_values = sorted(readMateHash[key], key=lambda x: x[0]) # Sorted based on base quality
-                    sorted_values = sorted_values[:-1] # removing the read with highest quality, so it will be retained for count
-                    for value in sorted_values:        # Removing everthing else                            
-                        qual_value, decreaseInfo = value
-                        remove_base = decreaseInfo[0]
-                        remove_is_reverse = decreaseInfo[1]
-
-                        count_duplicate.update(remove_base)
-                        (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
-
-            if (DP4[2] + DP4[3]) > ALTcount:    # that the ALTcount is larger  happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling                
-                if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd
-
-                if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev
-
-                nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd
-                count_nonREFnonALT.update_nonREFnonALT(nonREFnonALT)
-
-            #Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)"
-            supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted()))
-            supple_dup_str += ';DP2dup=' + ','.join(map(str, count_duplicate.counted()))
-            supple_dup_str += ';DP2pairEnd=' + ','.join(map(str, count_PE.counted()))
-            supple_dup_str += ';DP2mis=' + ','.join(map(str, count_mismatch.counted()))
-            supple_dup_str += ';DP2nonREFnonALT=' + ','.join(map(str, count_nonREFnonALT.counted()))
-
-            ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1])
-            ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2])
-
-            info_values[DP4_idx] = "DP4=" + str(DP4rf)+ "," + str(DP4rr)+ "," + str(DP4af)+ "," + str(DP4ar)            
-            info_values.append(ACGTNacgtn1_string)
-            info_values.append(ACGTNacgtn2_string)
-            info_values.append(DP4_original)
-            info_values.append(supple_dup_str)            
-
-            entries[header_indices["INFO"]] = ';'.join(info_values)
-            
-            sys.stdout.write('\t'.join(entries) + '\n')
-        else:
-            sys.stdout.write(line)   # write germline and somatic-multiallelic SNVs as is
-
-        del ALT_basePositions
-        del REF_basePositions
-        del REF_baseQualities
-        del ALT_baseQualities
-    samfile.close()
-
-    if args.altBQF is not None:
-        ALT_baseQualities_file.close()
-
-    if args.refBQF is not None:
-        REF_baseQualities_file.close()
-
-    if args.altPosF is not None:
-        ALT_basePositions_file.close()
-
-    if args.refPosF is not None:
-        REF_basePositions_file.close()
-
-    #vcfInFile.close()
-    #outFile.close()
-    
-if __name__ == '__main__':
-    #print "Starting program...\n" 
-    import argparse
-    parser = argparse.ArgumentParser()
-    #parser.add_option('--inf',action='store',type='string',dest='inf',help='Specify the name of the input vcf file containing all snvs (germline and somatic)',default='')
-    parser.add_argument('--alignmentFile',dest='alignmentFile',help='Specify the name of the BAM file containing bwa alignments, has to be the BAM file that was used to call the variants in the input vcf file - REQUIRED', required=True)
-    #parser.add_option('--outf',action='store',type='string',dest='outf',help='Specify the name of the output file, which will have same format as input vcf but with PE overlap filtered DP4 values if somatic and if snvs in PE overlap region',default='')
-    parser.add_argument('--referenceFile', dest='refFileName', help='Specify the full path of reference genome file in fasta format, with index in the same directory')
-    parser.add_argument('--mapq',type=int,dest='mapq',help='Specify the minimum mapping quality of bwa used for mpileup as parameter -q (default: 30 )',default=30)
-    parser.add_argument('--baseq',type=int,dest='baseq',help='Specify the minimum base quality scores used for mpileup as parameter -Q (default: 13)',default=13)
-    parser.add_argument('--qualityScore',dest='qualityScore',help='Specify whether the per base  quality score is given in phred or illumina format (default is Illumina score: ASCII offset of 64, while PHRED scores have an ASCII offset of 33)',default='phred')
-    parser.add_argument('--maxNumberOfMismatchesInRead',type=int,dest='allowedNumberOfMismatches',help='Specify the number of mismatches that are allowed per read in order to consider this read. Value of -1 (default) turns this filter off.',default=-1)
-    parser.add_argument('--altBaseQualFile',nargs="?",type=argparse.FileType('w'),dest='altBQF',help='Specify the name of the output file for alternative allele base qualities.',default=None)
-    parser.add_argument('--refBaseQualFile',nargs="?",type=argparse.FileType('w'),dest='refBQF',help='Specify the name of the output file for reference allele base qualities.',default=None)
-    parser.add_argument('--altBasePositionsFile',nargs="?",type=argparse.FileType('w'),dest='altPosF',help='Specify the name of the output file for position within the reads for alternative bases.',default=None)
-    parser.add_argument('--refBasePositionsFile',nargs="?",type=argparse.FileType('w'),dest='refPosF',help='Specify the name of the output file for position within the reads for reference bases.',default=None)
-    parser.add_argument('--nocontrol',action='store_true',dest='no_control',help='Specify the workflow is run without control.',default=False)
-    parser.add_argument('--configFile',nargs="?",type=argparse.FileType('r'),dest='configfile',help='Specify the config file which contains header names.',default=None)
-
-
-    args = parser.parse_args()
-    performAnalysis(args)
-    #print "\nProgram successfully terminating...."
-
->>>>>>> master