Adding Support for Metadata #8

Merged: 24 commits on Apr 29, 2024
Commits
- d8313fc: First pass solution (emarinier, Apr 12, 2024)
- 0b38ae7: Updating basic workflow test. (emarinier, Apr 15, 2024)
- c214e20: Updating changelog. (emarinier, Apr 16, 2024)
- adc9e4a: Updating schema for metadata header parameters. (emarinier, Apr 16, 2024)
- 940e557: Updating README to include metadata information. (emarinier, Apr 16, 2024)
- af23d34: Adding missing test data. (emarinier, Apr 16, 2024)
- 4705b98: prettier (emarinier, Apr 16, 2024)
- edeb715: prettier (emarinier, Apr 16, 2024)
- 4dc4bd2: Update gasclustering.nf (mattheww95, Apr 18, 2024)
- a46bc2e: Adding group description to schema. (emarinier, Apr 22, 2024)
- a26fbaf: Merge branch 'metadata' of github.com:phac-nml/gasclustering into met… (emarinier, Apr 22, 2024)
- 7a563b1: Removing old changelog from example pipeline and correcting for new p… (emarinier, Apr 24, 2024)
- 1e963e6: Convert from example pipeline to gasclustering pipeline README. (emarinier, Apr 24, 2024)
- f5c959d: Updating schema. (emarinier, Apr 24, 2024)
- f08030d: process_low -> process_single (emarinier, Apr 24, 2024)
- 4155678: More diligence in ensuring rows map to each other correctly. (emarinier, Apr 24, 2024)
- 1c6fd66: Updating ArborView test to be more exact. (emarinier, Apr 24, 2024)
- a1f69e8: metadata under meta map (emarinier, Apr 24, 2024)
- ed4cadf: prettier (emarinier, Apr 24, 2024)
- f01999f: README grammar (emarinier, Apr 25, 2024)
- 44cfb49: Forcing failure when internal tabs exist in metadata columns. (emarinier, Apr 25, 2024)
- d1636fe: Adding more metadata tests. (emarinier, Apr 25, 2024)
- 339dc53: Trying to ignore trailing whitespace in test files. (emarinier, Apr 25, 2024)
- 9c16037: Trying to ignore trailing whitespace in test files. (emarinier, Apr 25, 2024)
6 changes: 6 additions & 0 deletions .editorconfig
@@ -26,6 +26,12 @@ indent_size = unset
[/assets/ArborView.html]
trim_trailing_whitespace = unset

[tests/data/append/expected_clusters_and_metadata_little_metadata.tsv]
trim_trailing_whitespace = unset

[tests/data/append/expected_clusters_and_metadata_no_metadata.tsv]
trim_trailing_whitespace = unset

# ignore Readme
[README.md]
indent_style = unset
21 changes: 2 additions & 19 deletions CHANGELOG.md
@@ -5,25 +5,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## In-development

- Fixed nf-core tools linting failures introduced in version 2.12.1.
- Added phac-nml prefix to nf-core config

## 1.0.3 - 2024/02/23

- Pinned [email protected] plugin

## 1.0.2 - 2023/12/18

- Removed GitHub workflows that weren't needed.
- Adding additional parameters for testing purposes.

## 1.0.1 - 2023/12/06

Allowing non-gzipped FASTQ files as input. Default branch is now main.

## 1.0.0 - 2023/11/30

Initial release of phac-nml/gasclustering, created with the [nf-core](https://nf-co.re/) template.
- Initial release of phac-nml/gasclustering.
- Added support for metadata.

### `Added`

79 changes: 45 additions & 34 deletions README.md
@@ -1,23 +1,29 @@
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A523.04.3-brightgreen.svg)](https://www.nextflow.io/)

# Example Pipeline for IRIDA Next
# Genomic Address Service Clustering Workflow

This is an example pipeline to be used for integration with IRIDA Next.
This workflow takes JSON-formatted MLST profiles as input and converts them into a phylogenetic tree with associated flat cluster codes for use in [IRIDA Next](https://github.com/phac-nml/irida-next). The workflow also generates an interactive tree for visualization.

# Input

The input to the pipeline is a standard sample sheet (passed as `--input samplesheet.csv`) that looks like:

| sample | fastq_1 | fastq_2 |
| ------- | --------------- | --------------- |
| SampleA | file_1.fastq.gz | file_2.fastq.gz |
| sample | mlst_alleles | metadata_1 | metadata_2 | metadata_3 | metadata_4 | metadata_5 | metadata_6 | metadata_7 | metadata_8 |
| ------- | ----------------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- |
| SampleA | sampleA.mlst.json | meta1 | meta2 | meta3 | meta4 | meta5 | meta6 | meta7 | meta8 |

The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/).

# Parameters

The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run.

## Metadata

In order to customize metadata headers, the parameters `--metadata_1_header` through `--metadata_8_header` may be specified.
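
For example, a minimal sketch of a custom configuration file that renames these columns; the header values below are placeholders for illustration, not pipeline defaults:

```nextflow
// custom.config (hypothetical): override the default metadata header names.
// The parameter names come from nextflow.config; the values are examples only.
params {
    metadata_1_header = "Country"
    metadata_2_header = "Province"
    metadata_3_header = "Collection Date"
}
```

A file like this could be supplied with `-c custom.config`, or individual headers could be set directly on the command line (for example `--metadata_1_header Country`).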

## Other

Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schema.json).

# Running
@@ -41,41 +47,46 @@ An example of what the contents of the IRIDA Next JSON file looks like for this pipeline is shown below:
"files": {
"global": [
{
"path": "summary/summary.txt.gz"
"path": "ArborView/clustered_data_arborview.html"
},
{
"path": "clusters/run.json"
},
{
"path": "clusters/tree.nwk"
},
{
"path": "clusters/clusters.text"
},
{
"path": "clusters/thresholds.json"
},
{
"path": "distances/run.json"
},
{
"path": "distances/results.text"
},
{
"path": "distances/ref_profile.text"
},
{
"path": "distances/query_profile.text"
},
{
"path": "distances/allele_map.json"
},
{
"path": "merged/profile.tsv"
}
],
"samples": {
"SAMPLE1": [
{
"path": "assembly/SAMPLE1.assembly.fa.gz"
}
],
"SAMPLE2": [
{
"path": "assembly/SAMPLE2.assembly.fa.gz"
}
],
"SAMPLE3": [
{
"path": "assembly/SAMPLE3.assembly.fa.gz"
}
]

}
},
"metadata": {
"samples": {
"SAMPLE1": {
"reads.1": "sample1_R1.fastq.gz",
"reads.2": "sample1_R2.fastq.gz"
},
"SAMPLE2": {
"reads.1": "sample2_R1.fastq.gz",
"reads.2": "sample2_R2.fastq.gz"
},
"SAMPLE3": {
"reads.1": "sample1_R1.fastq.gz",
"reads.2": "null"
}

}
}
}
Expand All @@ -95,7 +106,7 @@ nextflow run phac-nml/gasclustering -profile docker,test -r main -latest --outdi

# Legal

Copyright 2023 Government of Canada
Copyright 2024 Government of Canada

Licensed under the MIT License (the "License"); you may not use
this work except in compliance with the License. You may obtain a copy of the
56 changes: 56 additions & 0 deletions assets/schema_input.json
@@ -19,6 +19,62 @@
"format": "file-path",
"pattern": "^\\S+\\.mlst\\.json(\\.gz)?$",
"errorMessage": "MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json' or '.mlst.json.gz'"
},
"metadata_1": {
"type": "string",
"meta": ["metadata_1"],
"errorMessage": "Metadata associated with the sample (metadata_1).",
"default": "",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_2": {
"type": "string",
"meta": ["metadata_2"],
"errorMessage": "Metadata associated with the sample (metadata_2).",
"default": "",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_3": {
"type": "string",
"meta": ["metadata_3"],
"errorMessage": "Metadata associated with the sample (metadata_3).",
"default": "",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_4": {
"type": "string",
"meta": ["metadata_4"],
"errorMessage": "Metadata associated with the sample (metadata_4).",
"default": "",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_5": {
"type": "string",
"meta": ["metadata_5"],
"errorMessage": "Metadata associated with the sample (metadata_5).",
"default": "",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_6": {
"type": "string",
"meta": ["metadata_6"],
"errorMessage": "Metadata associated with the sample (metadata_6).",
"default": "",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_7": {
"type": "string",
"meta": ["metadata_7"],
"errorMessage": "Metadata associated with the sample (metadata_7).",
"default": "",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_8": {
"type": "string",
"meta": ["metadata_8"],
"errorMessage": "Metadata associated with the sample (metadata_8).",
"default": "",
"pattern": "^[^\\n\\t\"]+$"
}
},
"required": ["sample", "mlst_alleles"]
56 changes: 56 additions & 0 deletions modules/local/appendmetadata/main.nf
@@ -0,0 +1,56 @@
process APPEND_METADATA {
tag "append_metadata"
label 'process_single'

input:
val clusters_path // cluster data as a TSV path
// this needs to be "val", because "path"
// won't stage the file correctly for exec
val metadata_rows // metadata rows (no headers) to be appended, list of lists
val metadata_headers // headers to name the metadata columns

output:
path("clusters_and_metadata.tsv"), emit: clusters

exec:
def clusters_rows // has a header row
def clusters_rows_map = [:]
def metadata_rows_map = [:]
def merged = []

clusters_path.withReader { reader ->
clusters_rows = reader.readLines()*.split('\t')
}

// Create a map of the cluster rows:
// Start on i = 1 because we don't want the headers.
for(int i = 1; i < clusters_rows.size(); i++)
{
// "sample" -> ["sample", 1, 2, 3, ...]
clusters_rows_map[clusters_rows[i][0]] = clusters_rows[i]
}

// Create a map of the metadata rows:
// Start on i = 0 because there are no headers included.
for(int i = 0; i < metadata_rows.size(); i++)
{
// "sample" -> ["sample", meta1, meta2, meta3, ...]
metadata_rows_map[metadata_rows[i][0]] = metadata_rows[i]
}

// Merge the headers
merged.add(clusters_rows[0] + metadata_headers)

// Merge the remaining rows in their original order:
// Start on i = 1 because we don't want the headers.
for(int i = 1; i < clusters_rows.size(); i++)
{
def sample_key = clusters_rows[i][0]
merged.add(clusters_rows_map[sample_key] + metadata_rows_map[sample_key][1..-1])
}

task.workDir.resolve("clusters_and_metadata.tsv").withWriter { writer ->
merged.each { writer.writeLine it.join("\t") }
}

}
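
A hedged sketch of how this process might be invoked from a workflow; the channel construction, file path, and sample values below are assumptions for illustration, not the pipeline's actual wiring:

```nextflow
// Hypothetical usage of APPEND_METADATA; names and values are examples only.
include { APPEND_METADATA } from './modules/local/appendmetadata/main'

workflow {
    // The clusters TSV is passed as a value (not staged) because the process
    // body is native 'exec:' code rather than a shell script.
    clusters_tsv = Channel.value(file("results/clusters/clusters.text"))

    // One list per sample: [sample_id, metadata_1 .. metadata_8].
    metadata_rows = Channel.value([
        ['sample1', '1.1', '1.2', '1.3', '1.4', '1.5', '1.6', '1.7', '1.8'],
        ['sample2', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '2.7', '2.8']
    ])

    // Column names, taken from the metadata_*_header parameters.
    metadata_headers = Channel.value([
        params.metadata_1_header, params.metadata_2_header,
        params.metadata_3_header, params.metadata_4_header,
        params.metadata_5_header, params.metadata_6_header,
        params.metadata_7_header, params.metadata_8_header
    ])

    APPEND_METADATA(clusters_tsv, metadata_rows, metadata_headers)

    APPEND_METADATA.out.clusters.view()
}
```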
10 changes: 10 additions & 0 deletions nextflow.config
@@ -63,6 +63,16 @@ params {
// Arborview specific data
// TODO check this works in azure
av_html = "./assets/ArborView.html"

// Metadata
metadata_1_header = "metadata_1"
metadata_2_header = "metadata_2"
metadata_3_header = "metadata_3"
metadata_4_header = "metadata_4"
metadata_5_header = "metadata_5"
metadata_6_header = "metadata_6"
metadata_7_header = "metadata_7"
metadata_8_header = "metadata_8"
}

// Load base.config by default for all pipelines
68 changes: 68 additions & 0 deletions nextflow_schema.json
@@ -39,6 +39,71 @@
}
}
},
"metadata": {
"title": "Metadata",
"type": "object",
"description": "The column header names of the metadata columns.",
"default": "",
"properties": {
"metadata_1_header": {
"type": "string",
"default": "metadata_1",
"description": "The header name of metadata column 1.",
"fa_icon": "far fa-sticky-note",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_2_header": {
"type": "string",
"default": "metadata_2",
"description": "The header name of metadata column 2.",
"fa_icon": "far fa-sticky-note",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_3_header": {
"type": "string",
"default": "metadata_3",
"description": "The header name of metadata column 3.",
"fa_icon": "far fa-sticky-note",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_4_header": {
"type": "string",
"default": "metadata_4",
"description": "The header name of metadata column 4.",
"fa_icon": "far fa-sticky-note",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_5_header": {
"type": "string",
"default": "metadata_5",
"description": "The header name of metadata column 5.",
"fa_icon": "far fa-sticky-note",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_6_header": {
"type": "string",
"default": "metadata_6",
"description": "The header name of metadata column 6.",
"fa_icon": "far fa-sticky-note",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_7_header": {
"type": "string",
"default": "metadata_7",
"description": "The header name of metadata column 7.",
"fa_icon": "far fa-sticky-note",
"pattern": "^[^\\n\\t\"]+$"
},
"metadata_8_header": {
"type": "string",
"default": "metadata_8",
"description": "The header name of metadata column 8.",
"fa_icon": "far fa-sticky-note",
"pattern": "^[^\\n\\t\"]+$"
}
},
"fa_icon": "far fa-clipboard"
},
"profile_dists": {
"title": "Profile Dists",
"type": "object",
@@ -280,6 +345,9 @@
{
"$ref": "#/definitions/input_output_options"
},
{
"$ref": "#/definitions/metadata"
},
{
"$ref": "#/definitions/profile_dists"
},
4 changes: 4 additions & 0 deletions tests/data/append/expected_clusters_and_metadata.tsv
@@ -0,0 +1,4 @@
id address level_1 level_2 level_3 myheader_1 myheader_2 myheader_3 myheader_4 myheader_5 myheader_6 myheader_7 myheader_8
sample1 1.1.1 1 1 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8
sample2 1.1.1 1 1 1 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8
sample3 2.2.2 2 2 2 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8
tests/data/append/expected_clusters_and_metadata_little_metadata.tsv
@@ -0,0 +1,4 @@
id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8
sample1 1.1.1 1 1 1 1.4
sample2 1.1.1 1 1 1
sample3 2.2.2 2 2 2 3.1 3.2 3.8
tests/data/append/expected_clusters_and_metadata_no_metadata.tsv
@@ -0,0 +1,4 @@
id address level_1 level_2 level_3 metadata_1 metadata_2 metadata_3 metadata_4 metadata_5 metadata_6 metadata_7 metadata_8
sample1 1.1.1 1 1 1
sample2 1.1.1 1 1 1
sample3 2.2.2 2 2 2
4 changes: 4 additions & 0 deletions tests/data/samplesheets/samplesheet-little-metadata.csv
@@ -0,0 +1,4 @@
sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,1.4,,,,
sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,,
sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,,,,,,3.8