From 1e4110b55c40752d89d9ba7521366a024efca405 Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 31 Jul 2024 11:38:58 +0100 Subject: [PATCH] Script to go through multiple ELOADs and report the naming convention found --- tasks/eva_3632/gather_naming_conventions.py | 99 +++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100755 tasks/eva_3632/gather_naming_conventions.py diff --git a/tasks/eva_3632/gather_naming_conventions.py b/tasks/eva_3632/gather_naming_conventions.py new file mode 100755 index 0000000..0c3a0b2 --- /dev/null +++ b/tasks/eva_3632/gather_naming_conventions.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +# Copyright 2020 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging
from argparse import ArgumentParser

from ebi_eva_common_pyutils.logger import logging_config as log_cfg

from eva_submission.eload_submission import Eload
from eva_submission.submission_config import load_config

logger = log_cfg.get_logger(__name__)

# Labels used for the ELOAD-level summary when no single naming convention
# is recorded for the whole ELOAD.
_DIFFERENT_PER_FILES = 'Different per files'
_MULTIPLE = 'Multiple'
_NOT_FOUND = 'Not found'
_NOT_CALCULATED = 'Not calculated'


def _describe_file(file_info):
    """Return the human-readable naming convention status of one VCF entry.

    :param file_info: mapping stored for one VCF file under
        validation -> naming_convention_check -> files in the ELOAD config.
        Presumably carries 'naming_convention' and 'naming_convention_map'
        keys; a missing key is treated like an empty value.
    :return: the naming convention itself when one is recorded, otherwise a
        short description derived from the naming convention map.
    """
    naming_convention = file_info.get('naming_convention')
    if naming_convention:
        return naming_convention
    nc_map = file_info.get('naming_convention_map')
    if nc_map:
        if _NOT_FOUND in nc_map:
            return f"Missing {len(nc_map[_NOT_FOUND])} chromosome"
        return f"Found {len(nc_map)} naming conventions"
    return 'naming conventions not assessed'


def _summarise_files(vcf_files_2_naming_conv):
    """Collapse the per-file results of one ELOAD into a single label.

    Labels only ever escalate while the files are scanned, in the order
    'Different per files' < 'Multiple' < 'Not found' < 'Not calculated',
    so the most severe status seen across the files is the one reported.

    :param vcf_files_2_naming_conv: non-empty mapping of VCF file name to its
        naming convention check entry.
    :return: one of the four summary labels.
    """
    summary = None
    for file_info in vcf_files_2_naming_conv.values():
        if file_info.get('naming_convention'):
            # Only reached when no ELOAD-level convention was recorded, which
            # means that different files use different naming conventions.
            if summary not in (_MULTIPLE, _NOT_CALCULATED, _NOT_FOUND):
                summary = _DIFFERENT_PER_FILES
        elif file_info.get('naming_convention_map'):
            nc_map = file_info['naming_convention_map']
            if summary != _NOT_CALCULATED and _NOT_FOUND in nc_map:
                summary = _NOT_FOUND
            elif summary not in (_NOT_CALCULATED, _NOT_FOUND):
                summary = _MULTIPLE
        else:
            # Neither a convention nor a map: the check produced nothing
            # usable for this file.
            summary = _NOT_CALCULATED
    return summary


def report_per_files(eloads):
    """Print one line per VCF file of each ELOAD with its naming convention.

    ELOADs without any per-file naming convention results print nothing.

    :param eloads: iterable of ELOAD numbers to report on.
    """
    for eload in eloads:
        eload_obj = Eload(eload_number=eload)
        vcf_files_2_naming_conv = eload_obj.eload_cfg.query('validation', 'naming_convention_check', 'files')
        if not vcf_files_2_naming_conv:
            continue
        for vcf_file, file_info in vcf_files_2_naming_conv.items():
            print(f"ELOAD_{eload}: {vcf_file} {_describe_file(file_info)}")


def report_per_eload(eloads):
    """Print one line per ELOAD summarising its naming convention.

    The ELOAD-level naming convention is printed when the config records one;
    otherwise the per-file results are collapsed into a summary label, and
    'Not calculated' is printed when there are no per-file results at all.

    :param eloads: iterable of ELOAD numbers to report on.
    """
    for eload in eloads:
        eload_obj = Eload(eload_number=eload)
        vcf_files_2_naming_conv = eload_obj.eload_cfg.query('validation', 'naming_convention_check', 'files')
        naming_conv = eload_obj.eload_cfg.query('validation', 'naming_convention_check', 'naming_convention')
        if not naming_conv:
            if vcf_files_2_naming_conv:
                naming_conv = _summarise_files(vcf_files_2_naming_conv)
            else:
                naming_conv = _NOT_CALCULATED

        print(f"ELOAD_{eload}: {naming_conv}")


def main():
    """Parse the command line and print the requested naming convention report."""
    parser = ArgumentParser(
        description='Report the naming convention recorded by the validation of one or more ELOADs.'
    )
    parser.add_argument('--eloads', required=True, type=int, nargs='+',
                        help='The ELOAD numbers of the submissions to summarise')
    parser.add_argument('--per_files', action='store_true', default=False,
                        help='Report the naming convention per file rather than per eload')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')

    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()
    if args.per_files:
        report_per_files(args.eloads)
    else:
        report_per_eload(args.eloads)


if __name__ == "__main__":
    main()