Skip to content

Commit

Permalink
Merge pull request #19 from phac-nml/update/input_assure
Browse files Browse the repository at this point in the history
Update input_assure.py to support updates in mlst.json allele reports with the introduction of locidex:0.2.3
  • Loading branch information
kylacochrane authored Aug 23, 2024
2 parents 378310e + 288ce4d commit da39d65
Show file tree
Hide file tree
Showing 17 changed files with 206 additions and 120 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.2.0] - 2024-08-23

### Changed

- Upgraded `locidex/merge` to version `0.2.3` and updated `input_assure.py` and test data for compatibility with the new `mlst.json` allele file format.
- [PR19](https://github.com/phac-nml/arboratornf/pull/19)

## [0.1.0] - 2024-08-20

Initial release of the arboratornf pipeline to be used for running [Arborator](https://github.com/phac-nml/arborator) under Nextflow.
Expand All @@ -15,3 +22,4 @@ Initial release of the arboratornf pipeline to be used for running [Arborator](h
- ArborView integration.

[0.1.0]: https://github.com/phac-nml/arboratornf/releases/tag/0.1.0
[0.2.0]: https://github.com/phac-nml/arboratornf/releases/tag/0.2.0
12 changes: 6 additions & 6 deletions assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sample,mlst_alleles,metadata_partition,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
S1,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S1.mlst.json,1,"Escherichia coli","EHEC/STEC","Canada","O157:H7",21,"2024/05/30","beef",true
S2,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S2.mlst.json,1,"Escherichia coli","EHEC/STEC","The United States","O157:H7",55,"2024/05/21","milk",false
S3,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S3.mlst.json,2,"Escherichia coli","EPEC","France","O125",14,"2024/04/30","cheese",true
S4,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S4.mlst.json,2,"Escherichia coli","EPEC","France","O125",35,"2024/04/22","cheese",true
S5,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S5.mlst.json,3,"Escherichia coli","EAEC","Canada","O126:H27",61,"2012/09/01","milk",false
S6,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2011/12/25","fruit",false
S1,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S1.mlst.json,1,"Escherichia coli","EHEC/STEC","Canada","O157:H7",21,"2024/05/30","beef",true
S2,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S2.mlst.json,1,"Escherichia coli","EHEC/STEC","The United States","O157:H7",55,"2024/05/21","milk",false
S3,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S3.mlst.json,2,"Escherichia coli","EPEC","France","O125",14,"2024/04/30","cheese",true
S4,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S4.mlst.json,2,"Escherichia coli","EPEC","France","O125",35,"2024/04/22","cheese",true
S5,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S5.mlst.json,3,"Escherichia coli","EAEC","Canada","O126:H27",61,"2012/09/01","milk",false
S6,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2011/12/25","fruit",false
45 changes: 19 additions & 26 deletions bin/input_assure.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,38 +19,41 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f
with open_file(json_file, "rt") as f:
json_data = json.load(f)

# Extract the profile from the json_data
profile = json_data.get("data", {}).get("profile", {})
# Collect the sample keys present in the profile (sorted) and remember the first one
keys = sorted(profile.keys())
original_key = keys[0] if keys else None

# Define a variable to store the match_status (True or False)
match_status = sample_id in json_data
match_status = sample_id in profile

# Initialize the error message
error_message = None

# Check for multiple keys in the JSON file and define error message
keys = list(json_data.keys())
original_key = keys[0] if keys else None

if len(keys) == 0:
error_message = f"{json_file} is completely empty!"
if not keys:
error_message = f"{json_file} is missing the 'profile' section or is completely empty!"
print(error_message)
sys.exit(1)
elif len(keys) > 1:
# Check if sample_id matches any key
if not match_status:
error_message = f"No key in the MLST JSON file ({json_file}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed."
# Retain only the specified sample ID
json_data = {sample_id: json_data.pop(original_key)}
json_data["data"]["profile"] = {sample_id: profile.pop(original_key)}
else:
error_message = f"MLST JSON file ({json_file}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry"
# Remove all keys except the one matching sample_id
json_data = {sample_id: json_data[sample_id]}
# Retain only the specified sample_id in the profile
json_data["data"]["profile"] = {sample_id: profile[sample_id]}
elif not match_status:
# Define error message based on meta.address (query or reference)
if address == "null":
error_message = f"Query {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness."
else:
error_message = f"Reference {sample_id} ID and JSON key in {json_file} DO NOT MATCH. The '{original_key}' key in {json_file} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness."
# Update the JSON file with the new sample ID
json_data[sample_id] = json_data.pop(original_key)
json_data["data"]["profile"] = {sample_id: profile.pop(original_key)}
json_data["data"]["sample_name"] = sample_id

# Write file containing relevant error messages
if error_message:
Expand All @@ -69,21 +72,11 @@ def check_inputs(json_file, sample_id, address, output_error_file, output_json_f
description="Check sample inputs, force change if ID ≠ KEY, and generate an error report."
)
parser.add_argument("--input", help="Path to the mlst.json file.", required=True)
parser.add_argument(
"--sample_id", help="Sample ID to check in the JSON file.", required=True
)
parser.add_argument(
"--address", help="Address to use in the error message.", required=True
)
parser.add_argument(
"--output_error", help="Path to the error report file.", required=True
)
parser.add_argument(
"--output_json", help="Path to the MLST JSON file (gzipped).", required=True
)
parser.add_argument("--sample_id", help="Sample ID to check in the JSON file.", required=True)
parser.add_argument("--address", help="Address to use in the error message.", required=True)
parser.add_argument("--output_error", help="Path to the error report file.", required=True)
parser.add_argument("--output_json", help="Path to the MLST JSON file (gzipped).", required=True)

args = parser.parse_args()

check_inputs(
args.input, args.sample_id, args.address, args.output_error, args.output_json
)
check_inputs(args.input, args.sample_id, args.address, args.output_error, args.output_json)
2 changes: 1 addition & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ params {
max_time = '1.h'

// Input data
input = 'https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/samplesheets/samplesheet.csv'
input = "${projectDir}/tests/data/samplesheets/samplesheet.csv"

outdir = "results"

Expand Down
5 changes: 3 additions & 2 deletions modules/local/locidex/merge/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ process LOCIDEX_MERGE {
label 'process_medium'

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/locidex:0.1.1--pyhdfd78af_0' :
'biocontainers/locidex:0.1.1--pyhdfd78af_0' }"
"docker.io/mwells14/locidex:0.2.3" :
task.ext.override_configured_container_registry != false ? 'docker.io/mwells14/locidex:0.2.3' :
'mwells14/locidex:0.2.3' }"

input:
path input_values // [file(sample1), file(sample2), file(sample3), etc...]
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ manifest {
description = """Arborator: Genomic Profile Clustering and Summary"""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
version = '0.1.0'
version = '0.2.0'
doi = ''
defaultBranch = 'main'
}
Expand Down
32 changes: 23 additions & 9 deletions tests/data/profiles/S1.mlst.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
{
"S1": {
"locus_1": 1,
"locus_2": 1,
"locus_3": "1",
"locus_4": "1",
"locus_5": "1",
"locus_6": 1,
"locus_7": 1
"db_info": {},
"parameters": {
"mode": "normal",
"min_match_ident": 100,
"min_match_cov": 100,
"max_ambiguous": 0,
"max_internal_stops": 0
},
"data": {
"sample_name": "S1",
"profile": {
"S1": {
"locus_1": 1,
"locus_2": 1,
"locus_3": "1",
"locus_4": "1",
"locus_5": "1",
"locus_6": 1,
"locus_7": 1
}
},
"seq_data": {}
}
}
}
32 changes: 23 additions & 9 deletions tests/data/profiles/S2.mlst.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
{
"S2": {
"locus_1": 1,
"locus_2": 1,
"locus_3": "2",
"locus_4": "2",
"locus_5": "?",
"locus_6": 4,
"locus_7": 1
"db_info": {},
"parameters": {
"mode": "normal",
"min_match_ident": 100,
"min_match_cov": 100,
"max_ambiguous": 0,
"max_internal_stops": 0
},
"data": {
"sample_name": "S2",
"profile": {
"S2": {
"locus_1": 1,
"locus_2": 1,
"locus_3": "2",
"locus_4": "2",
"locus_5": "?",
"locus_6": 4,
"locus_7": 1
}
},
"seq_data": {}
}
}
}
32 changes: 23 additions & 9 deletions tests/data/profiles/S3.mlst.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
{
"S3": {
"locus_1": 1,
"locus_2": 2,
"locus_3": "2",
"locus_4": "2",
"locus_5": "1",
"locus_6": 5,
"locus_7": 1
"db_info": {},
"parameters": {
"mode": "normal",
"min_match_ident": 100,
"min_match_cov": 100,
"max_ambiguous": 0,
"max_internal_stops": 0
},
"data": {
"sample_name": "S3",
"profile": {
"S3": {
"locus_1": 1,
"locus_2": 2,
"locus_3": "2",
"locus_4": "2",
"locus_5": "1",
"locus_6": 5,
"locus_7": 1
}
},
"seq_data": {}
}
}
}
32 changes: 23 additions & 9 deletions tests/data/profiles/S4.mlst.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
{
"S4": {
"locus_1": 1,
"locus_2": 2,
"locus_3": "3",
"locus_4": "2",
"locus_5": "1",
"locus_6": 6,
"locus_7": 1
"db_info": {},
"parameters": {
"mode": "normal",
"min_match_ident": 100,
"min_match_cov": 100,
"max_ambiguous": 0,
"max_internal_stops": 0
},
"data": {
"sample_name": "S4",
"profile": {
"S4": {
"locus_1": 1,
"locus_2": 2,
"locus_3": "3",
"locus_4": "2",
"locus_5": "1",
"locus_6": 6,
"locus_7": 1
}
},
"seq_data": {}
}
}
}
32 changes: 23 additions & 9 deletions tests/data/profiles/S5.mlst.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
{
"S5": {
"locus_1": 1,
"locus_2": 2,
"locus_3": "?",
"locus_4": "2",
"locus_5": "1",
"locus_6": 8,
"locus_7": 1
"db_info": {},
"parameters": {
"mode": "normal",
"min_match_ident": 100,
"min_match_cov": 100,
"max_ambiguous": 0,
"max_internal_stops": 0
},
"data": {
"sample_name": "S5",
"profile": {
"S5": {
"locus_1": 1,
"locus_2": 2,
"locus_3": "?",
"locus_4": "2",
"locus_5": "1",
"locus_6": 8,
"locus_7": 1
}
},
"seq_data": {}
}
}
}
32 changes: 23 additions & 9 deletions tests/data/profiles/S6.mlst.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
{
"S6": {
"locus_1": 2,
"locus_2": 3,
"locus_3": "3",
"locus_4": "-",
"locus_5": "?",
"locus_6": 9,
"locus_7": 0
"db_info": {},
"parameters": {
"mode": "normal",
"min_match_ident": 100,
"min_match_cov": 100,
"max_ambiguous": 0,
"max_internal_stops": 0
},
"data": {
"sample_name": "S6",
"profile": {
"S6": {
"locus_1": 2,
"locus_2": 3,
"locus_3": "3",
"locus_4": "-",
"locus_5": "?",
"locus_6": 9,
"locus_7": 0
}
},
"seq_data": {}
}
}
}
12 changes: 6 additions & 6 deletions tests/data/samplesheets/samplesheet-bad-metadata_1.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sample,mlst_alleles,metadata_partition,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
S1,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S1.mlst.json,1,"Escherichia coli|","EHEC/STEC","Canada","O157:H7",21,"2024/05/30","beef",true
S2,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S2.mlst.json,1,"Escherichia coli","EHEC/STEC","The United States","O157:H7",55,"2024/05/21","milk",false
S3,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S3.mlst.json,2,"Escherichia coli","EPEC","France","O125",14,"2024/04/30","cheese",true
S4,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S4.mlst.json,2,"Escherichia coli","EPEC","France","O125",35,"2024/04/22","cheese",true
S5,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S5.mlst.json,3,"Escherichia coli","EAEC","Canada","O126:H27",61,"2012/09/01","milk",false
S6,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2011/12/25","fruit",false
S1,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S1.mlst.json,1,"Escherichia coli|","EHEC/STEC","Canada","O157:H7",21,"2024/05/30","beef",true
S2,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S2.mlst.json,1,"Escherichia coli","EHEC/STEC","The United States","O157:H7",55,"2024/05/21","milk",false
S3,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S3.mlst.json,2,"Escherichia coli","EPEC","France","O125",14,"2024/04/30","cheese",true
S4,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S4.mlst.json,2,"Escherichia coli","EPEC","France","O125",35,"2024/04/22","cheese",true
S5,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S5.mlst.json,3,"Escherichia coli","EAEC","Canada","O126:H27",61,"2012/09/01","milk",false
S6,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2011/12/25","fruit",false
12 changes: 6 additions & 6 deletions tests/data/samplesheets/samplesheet-bad-metadata_partition.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sample,mlst_alleles,metadata_partition,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
S1,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S1.mlst.json,1@,"Escherichia coli","EHEC/STEC","Canada","O157:H7",21,"2024/05/30","beef",true
S2,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S2.mlst.json,1,"Escherichia coli","EHEC/STEC","The United States","O157:H7",55,"2024/05/21","milk",false
S3,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S3.mlst.json,2,"Escherichia coli","EPEC","France","O125",14,"2024/04/30","cheese",true
S4,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S4.mlst.json,2,"Escherichia coli","EPEC","France","O125",35,"2024/04/22","cheese",true
S5,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S5.mlst.json,3,"Escherichia coli","EAEC","Canada","O126:H27",61,"2012/09/01","milk",false
S6,https://raw.githubusercontent.com/phac-nml/arboratornf/dev/tests/data/profiles/S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2011/12/25","fruit",false
S1,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S1.mlst.json,1@,"Escherichia coli","EHEC/STEC","Canada","O157:H7",21,"2024/05/30","beef",true
S2,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S2.mlst.json,1,"Escherichia coli","EHEC/STEC","The United States","O157:H7",55,"2024/05/21","milk",false
S3,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S3.mlst.json,2,"Escherichia coli","EPEC","France","O125",14,"2024/04/30","cheese",true
S4,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S4.mlst.json,2,"Escherichia coli","EPEC","France","O125",35,"2024/04/22","cheese",true
S5,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S5.mlst.json,3,"Escherichia coli","EAEC","Canada","O126:H27",61,"2012/09/01","milk",false
S6,https://raw.githubusercontent.com/phac-nml/arboratornf/update/input_assure/tests/data/profiles/S6.mlst.json,unassociated,"Escherichia coli","EAEC","Canada","O111:H21",43,"2011/12/25","fruit",false
Loading

0 comments on commit da39d65

Please sign in to comment.