Merge pull request #108 from genxnetwork/dummy_simulated_samples
Dummy simulated samples
pavelnikonorov committed Jul 28, 2023
2 parents 23e423c + a997db9 commit fdf5742
Showing 26 changed files with 661 additions and 251 deletions.
28 changes: 19 additions & 9 deletions analysis/GRAPE Article Visualisations.ipynb

Large diffs are not rendered by default.

19 changes: 12 additions & 7 deletions containers/snakemake/Dockerfile
@@ -14,10 +14,15 @@ ENV SHELL /bin/bash
RUN apt-get update && apt-get install -y wget bzip2 gnupg2 git libgomp1 libarchive13 && \
wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
rm Miniconda3-latest-Linux-x86_64.sh && \
conda install -c conda-forge mamba
rm Miniconda3-latest-Linux-x86_64.sh
# conda install -n base --override-channels -c conda-forge mamba==1.4.5 'python_abi=*=*cp*'

RUN for e in envs/*; do mamba env create -f $e ; done && \
# RUN for e in envs/*; do mamba env create -f $e ; done && \
# conda clean --all -y

RUN conda install -n base conda-libmamba-solver && \
conda config --set solver libmamba && \
conda env create -f envs/big_env.yaml && \
conda clean --all -y

# Install Minimac3
@@ -29,10 +34,6 @@ ENV PATH "$PATH:/opt/Minimac3Executable/bin"
# Install Minimac4
RUN apt-get install -y minimac4

# Install Eagle
# RUN wget "https://data.broadinstitute.org/alkesgroup/Eagle/downloads/dev/eagle_v2.4.1" -O /usr/bin/eagle
# RUN chmod a+x /usr/bin/eagle

# Install Germline
# conda version of germline has a problem with non-zero error code
# https://github.com/gusevlab/germline/issues/8
@@ -48,6 +49,10 @@ RUN apt-get install -y make g++ git && \
ENV PATH "$PATH:/opt/germline/bin"
WORKDIR /

# Install Eagle
RUN wget "https://data.broadinstitute.org/alkesgroup/Eagle/downloads/dev/eagle_v2.4.1" -O /usr/bin/eagle
RUN chmod a+x /usr/bin/eagle

# Workaround for NonWritableError when conda tries to create environments for the first time
# funnel launches docker containers with --read-only and snakemake cannot create conda envs
# because it has to do something with urls.txt
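The net effect of this change is that all tools now land in a single conda environment built with the faster libmamba solver, instead of one environment per YAML file in envs/*. A rough local equivalent of the new steps, for reproducing the environment outside Docker (the image tag in the build command is hypothetical):

    # same sequence the Dockerfile now runs
    conda install -n base conda-libmamba-solver
    conda config --set solver libmamba
    conda env create -f envs/big_env.yaml
    conda clean --all -y

    # or build the container itself (tag and build context are placeholders)
    docker build -t grape-snakemake -f containers/snakemake/Dockerfile .
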
47 changes: 47 additions & 0 deletions envs/big_env.yaml
@@ -0,0 +1,47 @@
name: snakemake
channels:
- conda-forge
- bioconda
- biocore
- b3bg
- alex_genxt
- defaults
dependencies:
- python>=3.9
- matplotlib
- pandas
- numpy
- seaborn
- biom-format
- scikit-bio
- docutils
- mmh3
- bcftools
- samtools
- plink
- unzip
- wget
- openssl
- parallel
- ibis
- openjdk
- picard-slim
- ped-sim
- networkx
- scipy
- statsmodels
- rapid-ibd
- scikit-allel
- scikit-learn
- king
- snakemake
- pip
- pip:
- polars
- pydot
- "--editable=git+https://github.com/alex-medvedev-msc/ersa.git#egg=ersa"
# - pip:
# - "--editable=git+https://github.com/Jahysama/snakemake.git#egg=snakemake"
#A fork of snakemake is used here because conda envs do not work inside python scripts
#Please check out https://github.com/snakemake/snakemake/pull/1812 for more info
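Creating and activating the merged environment on its own should look roughly like this (the environment is named snakemake by the YAML above; the version check is just a smoke test):

    conda env create -f envs/big_env.yaml
    conda activate snakemake
    snakemake --version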

25 changes: 24 additions & 1 deletion launcher.py
@@ -328,6 +328,27 @@ def get_parser_args():
type=int,
help='Random seed for Ped-sim pedigree simulation. The default value is randomly generated.')

parser.add_argument(
'--sim-samples-number',
default=1000,
type=int,
help='Number of samples to simulate in Ped-sim pedigree simulation using simbig workflow.'
)

parser.add_argument(
'--background',
default='1kg',
type=str,
help='Founders for simulation. The default value is 1kg. In this case, it will use 1000 genomes founders'
)

parser.add_argument(
'--augment-background',
default=0,
type=int,
help='How many samples to add to the background. The default value is 0. In this case, it will not add any samples to the background'
)

args = parser.parse_args()

valid_commands = [
@@ -493,8 +514,10 @@ def copy_input(input_dir, working_dir, samples_file):
config_dict['alt_hom_samples'] = args.alt_hom_samples
config_dict['het_samples'] = args.het_samples
config_dict['iqr_alpha'] = args.iqr_alpha

config_dict['sim_samples_number'] = args.sim_samples_number
config_dict['seed'] = args.seed
config_dict['background'] = args.background
config_dict['augment_background'] = args.augment_background

if args.weight_mask:
config_dict['weight_mask'] = os.path.join(args.directory, args.weight_mask)
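Based on the help texts above, a simulation run with the new options might be launched along these lines (the command word simbig comes from the --sim-samples-number help string; the remaining flags and paths are assumptions, not taken from this diff):

    python launcher.py simbig --directory /media/ref \
        --sim-samples-number 2000 --background 1kg --augment-background 500
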
22 changes: 12 additions & 10 deletions rules/filter.smk
@@ -1,6 +1,16 @@
rule annotate_snp_ids:
input:
vcf = 'vcf/{batch}_merged_lifted.vcf.gz'
output:
vcf = 'vcf/{batch}_merged_annotated.vcf.gz'
shell:
'''
bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" {input.vcf} -O z -o {output.vcf}
'''

rule select_bad_samples:
input:
vcf='vcf/{batch}_merged_lifted.vcf.gz'
vcf='vcf/{batch}_merged_annotated.vcf.gz'
output:
bad_samples='vcf/{batch}_lifted_vcf.badsamples',
report='results/{batch}_bad_samples_report.tsv',
@@ -17,22 +27,18 @@ rule select_bad_samples:
psc='stats/{batch}_lifted_vcf.psc',
keep_samples='stats/{batch}_keep_samples.list',

conda:
'evaluation'
script:
'../scripts/select_bad_samples.py'


rule plink_filter:
input:
vcf='vcf/{batch}_merged_lifted.vcf.gz',
vcf='vcf/{batch}_merged_annotated.vcf.gz',
bad_samples=rules.select_bad_samples.output.bad_samples
output:
bed = temp('plink/{batch}_merged_filter.bed'),
bim = temp('plink/{batch}_merged_filter.bim'),
fam = temp('plink/{batch}_merged_filter.fam')
conda:
'plink'
params:
input = '{batch}_merged',
out = '{batch}_merged_filter',
@@ -78,8 +84,6 @@ rule plink_clean_up:
params:
input = 'plink/{batch}_merged_filter',
out = 'plink/{batch}_merged_mapped'
conda:
'plink'
log:
'logs/plink/{batch}_plink_clean_up.log'
benchmark:
@@ -116,8 +120,6 @@ rule prepare_vcf:
params:
input = 'plink/{batch}_merged_mapped',
vcf = 'vcf/{batch}_merged_mapped_sorted.vcf.gz'
conda:
'bcf_plink'
log:
plink='logs/plink/{batch}_prepare_vcf.log',
vcf='logs/vcf/{batch}_prepare_vcf.log'
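The new annotate_snp_ids rule gives every variant a deterministic CHROM:POS:REF:ALT identifier before any filtering, which is why the downstream rules now consume {batch}_merged_annotated.vcf.gz. Its effect can be checked in isolation (file names are placeholders):

    bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" in.vcf.gz -O z -o out.vcf.gz
    bcftools query -f '%ID\n' out.vcf.gz | head -1
    # a record at position 12345 on chr1 with REF A and first ALT G prints: 1:12345:A:G
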
27 changes: 21 additions & 6 deletions rules/phasing.smk
@@ -1,8 +1,23 @@
rule index_for_eagle:
input:
bcf='vcf/{batch}_merged_mapped_sorted.bcf.gz'
output:
idx='vcf/{batch}_merged_mapped_sorted.bcf.gz.tbi'
log:
'logs/vcf/{batch}_merged_mapped_sorted.bcf.gz.tbi.log'
conda:
'bcftools'
shell:
'''
bcftools index -f -t {input.bcf} |& tee {log}
'''

rule phase:
input:
vcf='vcf/{batch}_merged_mapped_sorted.vcf.gz',
vcf='vcf/{batch}_merged_mapped_sorted.bcf.gz',
idx='vcf/{batch}_merged_mapped_sorted.bcf.gz.tbi',
vcfRef=REF_VCF
output: temp('phase/{batch}_chr{chrom}.phased.vcf.gz')
output: temp('phase/{batch}_chr{chrom}.phased.bcf.gz')
log:
'logs/phase/{batch}_eagle-{chrom}.log'
benchmark:
@@ -13,20 +28,20 @@ rule phase:
--vcfTarget {input.vcf} \
--geneticMapFile {GENETIC_MAP} \
--chrom {wildcards.chrom} \
--vcfOutFormat z \
--vcfOutFormat b \
--pbwtIters 2 \
--Kpbwt 20000 \
--outPrefix phase/{wildcards.batch}_chr{wildcards.chrom}.phased |& tee {log}
'''


phase = ['chr{i}.phased.vcf.gz'.format(i=chr) for chr in CHROMOSOMES]
phase = ['chr{i}.phased.bcf.gz'.format(i=chr) for chr in CHROMOSOMES]
phase_batch = [ 'phase/{batch}_' + line for line in phase]
rule merge_phased:
input:
phase_batch
output:
'phase/{batch}_merged_phased.vcf.gz'
'phase/{batch}_merged_phased.bcf.gz'
params:
list='vcf/{batch}_phased.merge.list',
mode=config['mode']
@@ -55,7 +70,7 @@ rule merge_phased:
# check if there is a background data and merge it
if [ -f "background/{wildcards.batch}_merged_imputed.vcf.gz" && {params.mode} = "client" ]; then
mv {output} {output}.client
bcftools merge --force-samples background/{wildcards.batch}_merged_imputed.vcf.gz {output}.client -O z -o {output} |& tee -a {log}
bcftools merge --force-samples background/{wildcards.batch}_merged_imputed.vcf.gz {output}.client -O b -o {output} |& tee -a {log}
bcftools index -f {output} |& tee -a {log}
fi
'''
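Since Eagle now reads and writes BCF (--vcfOutFormat b) and a dedicated index rule feeds it, the merged phased output can be sanity-checked with bcftools; for example, listing the phased samples (the path is a placeholder):

    bcftools query --list-samples phase/batch1_merged_phased.bcf.gz | head
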
32 changes: 5 additions & 27 deletions rules/preprocessing.smk
@@ -14,8 +14,6 @@ if NUM_BATCHES > 1:
temp(expand('vcf/batch{i}.txt',i=BATCHES))
params:
num_batches=NUM_BATCHES
conda:
'bcftools'
shell:
'''
bcftools query --list-samples input.vcf.gz >> vcf/samples.txt
@@ -37,8 +35,6 @@
samples='vcf/{batch}.txt'
output:
vcf='vcf/{batch}.vcf.gz'
conda:
'bcftools'
shell:
'''
bcftools view -S {input.samples} {input.vcf} -O z -o {output.vcf} --force-samples
@@ -81,8 +77,6 @@ if assembly == 'hg38':
vcf='vcf/{batch}_imputation_removed.vcf.gz'
output:
vcf=temp('vcf/{batch}_merged_lifted.vcf.gz')
conda:
'liftover'
log:
'logs/liftover/liftover{batch}.log'
params:
@@ -91,7 +85,7 @@
mem_mb=_mem_gb_for_ram_hungry_jobs() * 1024
shell:
'''
JAVA_OPTS="-Xmx{params.mem_gb}g" picard LiftoverVcf WARN_ON_MISSING_CONTIG=true MAX_RECORDS_IN_RAM=5000 I={input.vcf} O={output.vcf} CHAIN={LIFT_CHAIN} REJECT=vcf/chr{wildcards.batch}_rejected.vcf.gz R={GRCH37_FASTA} |& tee -a {log}
JAVA_OPTS="-Xmx{params.mem_gb}g" picard -Xmx12g LiftoverVcf WARN_ON_MISSING_CONTIG=true MAX_RECORDS_IN_RAM=5000 I={input.vcf} O={output.vcf} CHAIN={LIFT_CHAIN} REJECT=vcf/chr{wildcards.batch}_rejected.vcf.gz R={GRCH37_FASTA} |& tee -a {log}
'''
else:
rule copy_liftover:
@@ -110,11 +104,11 @@ if flow == 'rapid' or flow == 'germline-king':
vcf = 'vcf/{batch}_imputation_removed.vcf.gz'
output:
bcf = 'vcf/{batch}_merged_mapped_sorted.bcf.gz'
conda:
'bcftools'
shell:
'''
bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" {input.vcf} | bcftools view --min-af 0.05 -O b -o {output.bcf}
bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" {input.vcf} | \
bcftools view -t ^8:10428647-13469693,21:16344186-19375168,10:44555093-53240188,22:16051881-25095451,2:85304243-99558013,1:118434520-153401108,15:20060673-25145260,17:77186666-78417478,15:27115823-30295750,17:59518083-64970531,2:132695025-141442636,16:19393068-24031556,2:192352906-198110229 | \
bcftools view --min-af 0.05 --types snps -O b -o {output.bcf}
'''
else:
include: '../rules/filter.smk'
@@ -149,8 +143,6 @@
bcf=bcf_input
output:
vcf=vcf_output
conda:
'bcftools'
shell:
'''
bcftools view {input.bcf} -O z -o {output.vcf}
@@ -168,8 +160,6 @@ if not flow == 'rapid':
bim='preprocessed/{batch}_data.bim'
params:
out='preprocessed/{batch}_data'
conda:
'plink'
log:
'logs/plink/convert_mapped_to_plink{batch}.log'
benchmark:
@@ -199,8 +189,6 @@ if not flow == 'rapid':
bim='preprocessed/data.bim'
threads:
workflow.cores
conda:
'plink'
shell:
'''
rm files_list.txt || true
@@ -222,8 +210,6 @@ if not flow == 'rapid':
batches_vcf='preprocessed/{batch}_data.vcf.gz'
output:
batches_vcf_index=temp('preprocessed/{batch}_data.vcf.gz.csi')
conda:
'bcftools'
shell:
'''
bcftools index -f {input.batches_vcf}
@@ -248,8 +234,6 @@ if not flow == 'rapid':
params:
batches_vcf=expand('batch{s}_data.vcf.gz',s=BATCHES),
vcf='data.vcf.gz'
conda:
'bcftools'
shell:
'''
rm complete_vcf_list.txt || true
@@ -274,8 +258,6 @@ if not flow == 'rapid':
bim='preprocessed/data.bim'
params:
out='preprocessed/data'
conda:
'plink'
log:
'logs/plink/convert_mapped_to_plink_batch1.log'
benchmark:
@@ -291,8 +273,6 @@ if not flow == 'rapid':
bim='preprocessed/data.bim'
params:
genetic_map_GRCh37=expand(GENETIC_MAP_GRCH37,chrom=CHROMOSOMES)
conda:
'ibis'
output:
'preprocessed/data_mapped.bim'
log:
@@ -306,11 +286,9 @@ if not flow == 'rapid':
else:
rule create_samples_list:
input:
bcf_input = 'phase/batch1_merged_phased.bcf.gz'
bcf = 'phase/batch1_merged_phased.bcf.gz'
output:
fam='preprocessed/data.fam'
conda:
'bcftools'
shell:
'''
bcftools query --list-samples {input.bcf} >> {output.fam}
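In the rapid/germline-king branch above, the ^-prefixed -t argument excludes the listed intervals rather than selecting them (they resemble the commonly masked long-range high-LD regions), and --types snps drops indels before IBD detection. Reduced to a single region, the filter behaves like (file names are placeholders):

    bcftools view -t ^8:10428647-13469693 --types snps in.bcf.gz -O b -o out.bcf.gz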