Merge pull request #259 from nf-core/immcantation-devel

Immcantation devel
nf-core · Jul 1, 2023 · 06ab934 · 06ab934
2 parents f235c82 + 9ec2002
commit 06ab934
Show file tree

Hide file tree

Showing 32 changed files with 342 additions and 130 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -48,7 +48,8 @@ jobs:
         NXF_VER:
           - "22.10.1"
           - "latest-everything"
-        profile: ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled"]
+        profile:
+          ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled_hs", "test_assembled_mm"]
       fail-fast: false
     steps:
       - name: Check out pipeline code

diff --git a/.github/workflows/ci_immcantation.yml b/.github/workflows/ci_immcantation.yml
@@ -25,7 +25,12 @@ jobs:
         NXF_VER:
           - "22.10.1"
           - "latest-everything"
-        profile: ["test_assembled_immcantation_devel", "test_raw_immcantation_devel"]
+        profile:
+          [
+            "test_assembled_immcantation_devel_hs",
+            "test_assembled_immcantation_devel_mm",
+            "test_raw_immcantation_devel",
+          ]
       fail-fast: false
     steps:
       - name: Check out pipeline code

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,27 @@
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
-## [3.1] - 2023-06-05 "Protego"
+## [3.2.0dev] -
+
+### `Added`
+
+- [#268](https://github.com/nf-core/airrflow/pull/268) Added parameters for FindThreshold in `modules.config`.
+- [#268](https://github.com/nf-core/airrflow/pull/268) Validate samplesheet also for `assembled` samplesheet.
+- [#259](https://github.com/nf-core/airrflow/pull/259) Update to `EnchantR v0.1.3`.
+
+### `Fixed`
+
+- [#268](https://github.com/nf-core/airrflow/pull/268) Allows for uppercase and lowercase locus in samplesheet `pcr_target_locus`.
+- [#259](https://github.com/nf-core/airrflow/pull/259) Samplesheet only allows data from one species.
+- [#259](https://github.com/nf-core/airrflow/pull/259) Introduced fix for a too long command with hundreds of datasets.
+
+### `Dependencies`
+
+| Dependency | Old version | New version |
+| ---------- | ----------- | ----------- |
+| r-enchantr | 0.1.2       | 0.1.3       |
+
+## [3.1.0] - 2023-06-05 "Protego"
 
 ### `Added`
 

diff --git a/assets/repertoire_comparison.Rmd b/assets/repertoire_comparison.Rmd
@@ -31,6 +31,7 @@ library(alakazam)
 library(shazam)
 library(stringr)
 library(plotly)
+library(airr)
 
 theme_set(theme_bw(base_family = "ArialMT") +
             theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), text = element_text(family="ArialMT")))
@@ -54,21 +55,10 @@ datadir <- "."
 Number of reads for each of the samples and number of sequences left after performing sequence assembly and alignment to reference data.
 The full table can be found under [Table_sequences_assembly](repertoire_comparison/Sequence_numbers_summary/Table_sequences_assembly.tsv).
 
-```{r seq_numbers, echo=FALSE, warning=FALSE, results='asis'}
-read_table <- function(tab_file){
-        tab_seqs <- read.table(tab_file, header=TRUE, sep="\t", check.names = FALSE)
-        write.table(tab_seqs, file=paste0(seq_dir,"/Table_sequences_assembly.tsv"), sep="\t", quote=F, row.names=F)
-    }
-tryCatch( {read_table("./Table_sequences.tsv")} ,
-  error=function(e){message("No sequence numbers are available if starting with assembled reads.")}
-)
-
-```
-
-
 ```{r seq_numbers_plot, echo=FALSE, warning=FALSE, results='asis'}
 tryCatch( {
     tab_seqs <- read.table("./Table_sequences.tsv", header=TRUE, sep="\t", check.names = FALSE)
+    write.table(tab_seqs, file=paste0(seq_dir,"/Table_sequences_assembly.tsv"), sep="\t", quote=F, row.names=F)
 
     plot_table <- tidyr::pivot_longer(tab_seqs,
                                       cols=Sequences_R1:Igblast,
@@ -88,6 +78,8 @@ tryCatch( {
                   theme(axis.text.x= element_text(angle = 45))
 
     ggplotly(seqs_plot)
+
+
   },
   error=function(e){message("No sequence numbers are available if starting with assembled reads.")}
 )
@@ -144,33 +136,37 @@ ggplotly(seqs_plot_assembled)
 # in the current folder
 all_files <- system(paste0("find '", datadir, "' -name '*clone-pass.tsv'"), intern=T)
 
-diversity_dir <- paste(outdir, "Diversity", sep="/")
-abundance_dir <- paste(outdir, "Abundance", sep="/")
 vfamily_dir <- paste(outdir, "V_family", sep="/")
-dir.create(diversity_dir)
-dir.create(abundance_dir)
 dir.create(vfamily_dir)
 
 # Generate one big dataframe from all patient dataframes
+col_select <- c(
+  "sample_id", "subject_id", "sequence_id", "clone_id",
+  "v_call", "d_call", "j_call",
+  "locus",
+  "junction",
+  "pcr_target_locus"
+)
+df_all <- dplyr::bind_rows(lapply(all_files, read_rearrangement, col_select=col_select))
 
-df_list = lapply(all_files, read.csv, sep="\t")
-
-df_all <- dplyr::bind_rows(df_list)
 
 # Remove underscores in these columns
-df_all$subject_id <- sapply(df_all$subject_id, function(x) str_replace(as.character(x), "_", ""))
-df_all$sample_id <- sapply(df_all$sample_id, function(x) str_replace(as.character(x), "_", ""))
+df_all$subject_id <- stringr::str_replace_all(df_all$subject_id, "_", "")
+df_all$sample_id <- stringr::str_replace_all(df_all$sample_id , "_", "")
 
 # Annotate sample and samplepop (sample + population) by add ing all the conditions
 df_all$subj_locus <- as.factor(paste(df_all$sample_id, df_all$subject_id, df_all$pcr_target_locus, sep="_"))
 
-# Write table to file
-write.table(df_all, paste0(outdir,"/all_data.tsv"), sep = "\t", quote=F, row.names = F, col.names = T)
+# Uncomment to save a table with all the sequencess across samples together
+# write.table(df_all, paste0(outdir,"/all_data.tsv"), sep = "\t", quote=F, row.names = F, col.names = T)
 
 # Set number of bootrstraps
-nboot = 200
+nboot <- 200
 ```
 
+
+<!-- Uncomment to include Clonal abundance and clonal diversity in the repertoire comparison report
+
 # Clonal abundance
 
 For plotting the clonal abundance, the clones were ordered by size from bigger clones to smaller clones (x-axis, Rank).
@@ -184,7 +180,15 @@ range of the bootstrap samples.
 
 All clonal abundance plots and tables with abundance values can be found under `repertoire_analysis/Abundance`.
 
-```{r clonal_abundance, echo=FALSE}
+-->
+
+```{r clonal_abundance, echo=FALSE, eval=FALSE}
+# Set line above to eval=TRUE to include clonal abundance
+diversity_dir <- paste(outdir, "Diversity", sep="/")
+abundance_dir <- paste(outdir, "Abundance", sep="/")
+dir.create(diversity_dir)
+dir.create(abundance_dir)
+
 abund <- estimateAbundance(df_all, group = "subj_locus", ci=0.95, nboot=nboot)
 abund@abundance$sample_id <- sapply(abund@abundance$subj_locus, function(x) unlist(strsplit(as.character(x), "_"))[1])
 abund@abundance$subject_id <- sapply(abund@abundance$subj_locus, function(x) unlist(strsplit(as.character(x), "_"))[2])
@@ -208,12 +212,14 @@ p_ca
 
 ```
 
-```{r plot_abundance, include = FALSE}
+```{r plot_abundance, include = FALSE, eval=FALSE}
+# Set to eval=TRUE to include clonal abundance
 ggsave(plot=p_ca, filename = paste0(abundance_dir,"/Clonal_abundance_subject.pdf"), device="pdf", width = 25, height = 10, units="cm")
 ggsave(plot=p_ca, filename = paste0(abundance_dir,"/Clonal_abundance_subject.png"), device="png", width = 25, height = 10, units="cm")
 write.table(abund@abundance, file = paste0(abundance_dir, "/Clonal_abundance_data_subject.tsv"), sep="\t", quote = F, row.names = F)
 ```
 
+<!-- Uncomment to include Clonal diversity and clonal diversity in the repertoire comparison report
 
 # Clonal diversity
 
@@ -252,9 +258,10 @@ To correct for the different number of sequences in each of the samples, the Boo
 in which `r nboot` random bootstrap samples were taken, with size the number of sequences in the sample with less sequences (N).
 The solid line shows the mean Diversity of the bootstrap samples, whereas the transparent area shows the full Diversity
 range of the bootstrap samples.
+-->
 
-
-```{r clonal_diversity, echo = FALSE}
+```{r clonal_diversity, echo = FALSE, eval=FALSE}
+# Set line above to eval=TRUE to include clonal diversity
 sample_div <- alphaDiversity(abund, group="subj_locus", min_q=0, max_q=4, step_q=0.05,
                             ci=0.95, nboot=nboot)
 sample_main <- paste0("Sample diversity (N=", sample_div@n[1], ")")
@@ -273,12 +280,14 @@ div_p <- ggplot(sample_div@diversity, aes(x = q, y = d, group=sample_id)) +
 
 div_p
 ```
-```{r plot_diversity, include = FALSE}
+```{r plot_diversity, include = FALSE, eval=FALSE}
+# Set to eval=TRUE to include clonal diversity
 ggsave(plot=div_p, filename=paste0(diversity_dir,"/Diversity_patient_grid.png"), device="png", width = 25, height = 10, units="cm")
 ggsave(plot=div_p, filename=paste0(diversity_dir,"/Diversity_patient_grid.pdf"), device="pdf", width = 25, height = 10, units="cm")
 write.table(sample_div@diversity, file = paste0(diversity_dir, "/Clonal_diversity_data_subject.tsv"), sep="\t", quote = F, row.names = F)
 ```
 
+
 # V gene usage
 
 ## V gene family usage

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -15,7 +15,8 @@ def parse_args(args=None):
     Epilog = "Example usage: python check_samplesheet.py <FILE_IN>"
 
     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
-    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("file_in", help="Input samplesheet file.")
+    parser.add_argument("-a", "--assembled", help="Input samplesheet type", action="store_true", default=False)
     return parser.parse_args(args)
 
 
@@ -38,22 +39,22 @@ def print_error(error, context="Line", context_str=""):
     sys.exit(1)
 
 
-def check_samplesheet(file_in):
+def check_samplesheet(file_in, assembled):
     """
     This function checks that the samplesheet:
 
     - contains the compulsory fields: sample_id, filename_R1, filename_R2, subject_id, pcr_target_locus, species, single_cell
     - sample ids are unique
     - samples from the same subject come from the same species
-    - pcr_target_locus is "IG" or "TR"
+    - pcr_target_locus is "IG"/"ig" or "TR"/"tr"
     - species is "human" or "mouse"
     """
 
     sample_run_dict = {}
     with open(file_in, "r") as fin:
-        ## Check that required columns are present
+        # Defining minimum columns and required columns
         min_cols = 7
-        required_columns = [
+        required_columns_raw = [
             "sample_id",
             "filename_R1",
             "filename_R2",
@@ -66,7 +67,19 @@ def check_samplesheet(file_in):
             "biomaterial_provider",
             "age",
         ]
-        no_whitespaces = [
+        required_columns_assembled = [
+            "sample_id",
+            "filename",
+            "subject_id",
+            "species",
+            "pcr_target_locus",
+            "single_cell",
+            "sex",
+            "tissue",
+            "biomaterial_provider",
+            "age",
+        ]
+        no_whitespaces_raw = [
             "sample_id",
             "filename_R1",
             "filename_R2",
@@ -75,13 +88,52 @@ def check_samplesheet(file_in):
             "pcr_target_locus",
             "tissue",
         ]
+        no_whitespaces_assembled = [
+            "sample_id",
+            "filename",
+            "subject_id",
+            "species",
+            "pcr_target_locus",
+            "tissue",
+        ]
+
+        ## Read header
         header = [x.strip('"') for x in fin.readline().strip().split("\t")]
-        for col in required_columns:
-            if col not in header:
-                print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
-                print("Header is missing column {}".format(col))
-                print("Header must contain columns {}".format("\t".join(required_columns)))
-                raise IndexError("Header must contain columns {}".format("\t".join(required_columns)))
+        ## Read tab
+        tab = pd.read_csv(file_in, sep="\t", header=0)
+
+        # Check that all required columns for assembled and raw samplesheets are there, and do not contain whitespaces
+        if assembled:
+            for col in required_columns_assembled:
+                if col not in header:
+                    print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
+                    print("Header is missing column {}".format(col))
+                    print("Header must contain columns {}".format("\t".join(required_columns)))
+                    raise IndexError("Header must contain columns {}".format("\t".join(required_columns)))
+            for col in no_whitespaces_assembled:
+                values = tab[col].tolist()
+                if any([re.search(r"\s+", s) for s in values]):
+                    print_error(
+                        "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
+                            col, no_whitespaces_assembled
+                        )
+                    )
+
+        else:
+            for col in required_columns_raw:
+                if col not in header:
+                    print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
+                    print("Header is missing column {}".format(col))
+                    print("Header must contain columns {}".format("\t".join(required_columns)))
+                    raise IndexError("Header must contain columns {}".format("\t".join(required_columns)))
+            for col in no_whitespaces_raw:
+                values = tab[col].tolist()
+                if any([re.search(r"\s+", s) for s in values]):
+                    print_error(
+                        "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
+                            col, no_whitespaces_raw
+                        )
+                    )
 
         ## Check that rows have the same fields as header, and at least the compulsory ones are provided
         for line_num, line in enumerate(fin):
@@ -103,15 +155,14 @@ def check_samplesheet(file_in):
                 )
 
         ## Check that sample ids are unique
-        tab = pd.read_csv(file_in, sep="\t", header=0)
         if len(tab["sample_id"]) != len(set(tab["sample_id"])):
             print_error(
                 "Sample IDs are not unique! The sample IDs in the input samplesheet should be unique for each sample."
             )
 
         ## Check that pcr_target_locus is IG or TR
         for val in tab["pcr_target_locus"]:
-            if val not in ["IG", "TR"]:
+            if val.upper() not in ["IG", "TR"]:
                 print_error("pcr_target_locus must be one of: IG, TR.")
 
         ## Check that species is human or mouse
@@ -129,20 +180,10 @@ def check_samplesheet(file_in):
                     "The same subject_id cannot belong to different species! Check input file columns 'subject_id' and 'species'."
                 )
 
-        ## Check that values do not contain spaces in the no whitespaces columns
-        for col in no_whitespaces:
-            values = tab[col].tolist()
-            if any([re.search(r"\s+", s) for s in values]):
-                print_error(
-                    "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
-                        col, no_whitespaces
-                    )
-                )
-
 
 def main(args=None):
     args = parse_args(args)
-    check_samplesheet(args.FILE_IN)
+    check_samplesheet(args.file_in, args.assembled)
 
 
 if __name__ == "__main__":

diff --git a/bin/reveal_filter_quality.R b/bin/reveal_filter_quality.R
@@ -89,12 +89,16 @@ if (!is.null(opt$OUTPUT)) {
 } else {
     output_fn <- sub(".tsv$", "_quality-pass.tsv", basename(opt$REPERTOIRE))
 }
-write_rearrangement(db[filter_pass, ], file = output_fn)
+# don't write if empty
+if (sum(filter_pass)>0) {
+    write_rearrangement(db[filter_pass, ], file = output_fn)
+}
 
 # cat("     TOTAL_GROUPS> ", n_groups,  "\n", sep=" ", file = file.path(out_dir, log_verbose_name), append=TRUE)
 
 write("START> FilterQuality", stdout())
 write(paste0("FILE> ", basename(opt$REPERTOIRE)), stdout())
+# even if output file not written, because empty, keep track in log
 write(paste0("OUTPUT> ", basename(output_fn)), stdout())
 write(paste0("PASS> ", sum(filter_pass)), stdout())
 write(paste0("FAIL> ", sum(!filter_pass) + sum(filter_na)), stdout())