From 085437c3699975d02d6ff994e8d0034a2376cd2e Mon Sep 17 00:00:00 2001 From: sdewin Date: Fri, 9 Feb 2024 11:32:55 +0100 Subject: [PATCH 1/5] Add test for feature --- tests/split/cell_type_annotation.tsv | 1 - ..._type_annotation_one_bc_multiple_types.tsv | 30 ++++++++ tests/split/test_split.py | 68 ++++++++++++++----- 3 files changed, 82 insertions(+), 17 deletions(-) create mode 100644 tests/split/cell_type_annotation_one_bc_multiple_types.tsv diff --git a/tests/split/cell_type_annotation.tsv b/tests/split/cell_type_annotation.tsv index 0b692c3..a82b8f0 100644 --- a/tests/split/cell_type_annotation.tsv +++ b/tests/split/cell_type_annotation.tsv @@ -1,6 +1,5 @@ sample cell_type cell_barcode A type_1 TTAGCTTAGGAGAACA-1 -A type_1 TTAGCTTAGGAGAACA-1 A type_1 ATATTCCTCTTGTACT-1 A type_2 TGTGACAGTACAACGG-1 A type_2 CATGCCTTCTCTGACC-1 diff --git a/tests/split/cell_type_annotation_one_bc_multiple_types.tsv b/tests/split/cell_type_annotation_one_bc_multiple_types.tsv new file mode 100644 index 0000000..b1c398c --- /dev/null +++ b/tests/split/cell_type_annotation_one_bc_multiple_types.tsv @@ -0,0 +1,30 @@ +sample cell_type cell_barcode +A type_1 TTAGCTTAGGAGAACA-1 +A type_1 ATATTCCTCTTGTACT-1 +A type_2 TTAGCTTAGGAGAACA-1 +A type_2 CATGCCTTCTCTGACC-1 +A type_2 ATCGAGTAGGTTCGAG-1 +A type_3 CTCTCAGGTCCCTTTG-1 +A type_3 TTCGGTCTCACGTGTA-1 +A type_3 GTGACATCATTGTTCT-1 +A type_4 AAGGAGCCATCGACCG-1 +A type_4 ACCAAACTCTTAAGCG-1 +A type_4 CATTGGATCTCTTCCT-1 +A type_5 AGGCGAAAGGTCTTTG-1 +A type_5 AACGAGGCATCATGTG-1 +A type_5 CTACTTAGTCATGAGG-1 +B type_1 ATTACCTGTGTGCTTA-1 +B type_1 CATAACGTCGGTTGTA-1 +B type_1 ATGTCTTTCGGTCCGA-1 +B type_2 CAATCCCGTAGCGTTT-1 +B type_2 GCCATAATCATCGCTC-1 +B type_2 CAACGTAAGGCAAGGG-1 +B type_3 TATCGAGGTTGCCGCA-1 +B type_3 CATTCCGAGCTAACAA-1 +B type_3 CCTCCCTGTAAAGCTA-1 +B type_4 CAAGCTACACGTTGTA-1 +B type_4 CTGCTCATCACAAGCT-1 +B type_4 CAGTATGGTTCTTTGT-1 +B type_5 TTGCACCCATTAGCAC-1 +B type_5 CAGCCTTCATCTCTCG-1 +B type_5 TTGCGGGTCGTCAACA-1 diff --git 
a/tests/split/test_split.py b/tests/split/test_split.py index ed95207..b2c5fa0 100644 --- a/tests/split/test_split.py +++ b/tests/split/test_split.py @@ -6,48 +6,77 @@ TEST_DIRECTORY = pathlib.Path(__file__).parent.absolute() +FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE = { + "a.fragments": "a.fragments.tsv.gz", + "a.fragment_index": "a.fragments.tsv.gz.tbi", + "b.fragments": "b.fragments.tsv.gz", + "b.fragment_index": "b.fragments.tsv.gz.tbi", + "sample_to_fragment": "sample_to_fragment.tsv", + "cell_type_annotation": "cell_type_annotation.tsv", + "chrom_sizes": "hg38.chrom.sizes" +} + +FILES_SOME_BARCODES_MAPPING_TO_MULTIPLE_TYPES = { + "a.fragments": "a.fragments.tsv.gz", + "a.fragment_index": "a.fragments.tsv.gz.tbi", + "b.fragments": "b.fragments.tsv.gz", + "b.fragment_index": "b.fragments.tsv.gz.tbi", + "sample_to_fragment": "sample_to_fragment.tsv", + "cell_type_annotation": "cell_type_annotation_one_bc_multiple_types.tsv", + "chrom_sizes": "hg38.chrom.sizes" +} + def test_entrypoint(): exit_status = os.system("scatac_fragment_tools split") assert exit_status == 0 -def run_split_command(tmp_path, output_folder): - os.system(f"cp {TEST_DIRECTORY}/a.fragments.tsv.gz {tmp_path}") - os.system(f"cp {TEST_DIRECTORY}/a.fragments.tsv.gz.tbi {tmp_path}") - os.system(f"cp {TEST_DIRECTORY}/b.fragments.tsv.gz {tmp_path}") - os.system(f"cp {TEST_DIRECTORY}/b.fragments.tsv.gz.tbi {tmp_path}") - os.system(f"cp {TEST_DIRECTORY}/sample_to_fragment.tsv {tmp_path}") - os.system(f"cp {TEST_DIRECTORY}/cell_type_annotation.tsv {tmp_path}") - os.system(f"cp {TEST_DIRECTORY}/hg38.chrom.sizes {tmp_path}") +def run_split_command(tmp_path, output_folder, file_dict): + path_to_a_fragments = os.path.join(TEST_DIRECTORY, file_dict["a.fragments"]) + path_to_a_fragment_index = os.path.join(TEST_DIRECTORY, file_dict["a.fragment_index"]) + path_to_b_fragments = os.path.join(TEST_DIRECTORY, file_dict["b.fragments"]) + path_to_b_fragment_index = os.path.join(TEST_DIRECTORY, 
file_dict["b.fragment_index"]) + path_to_sample_to_fragment = os.path.join(TEST_DIRECTORY, file_dict["sample_to_fragment"]) + path_to_cell_type_annotation = os.path.join(TEST_DIRECTORY, file_dict["cell_type_annotation"]) + path_to_chrom_sizes = os.path.join(TEST_DIRECTORY, file_dict["chrom_sizes"]) + os.system(f"cp {path_to_a_fragments} {tmp_path}") + os.system(f"cp {path_to_a_fragment_index} {tmp_path}") + os.system(f"cp {path_to_b_fragments} {tmp_path}") + os.system(f"cp {path_to_b_fragment_index} {tmp_path}") + os.system(f"cp {path_to_sample_to_fragment} {tmp_path}") + os.system(f"cp {path_to_cell_type_annotation} {tmp_path}") + os.system(f"cp {path_to_chrom_sizes} {tmp_path}") + COMMAND = f"""cd {tmp_path} && \ scatac_fragment_tools split \ - -f {tmp_path}/sample_to_fragment.tsv \ - -b {tmp_path}/cell_type_annotation.tsv \ - -c {tmp_path}/hg38.chrom.sizes \ + -f {path_to_sample_to_fragment} \ + -b {path_to_cell_type_annotation} \ + -c {path_to_chrom_sizes} \ -o {output_folder} \ -t {tmp_path} \ """ return os.system(COMMAND) -def test_split_command(tmp_path): +def split_command_test_helper(tmp_path, file_dict): output_folder = os.path.join(tmp_path, "output") os.makedirs(output_folder, exist_ok=True) - exit_status = run_split_command(tmp_path, output_folder) + exit_status = run_split_command(tmp_path, output_folder, file_dict) assert exit_status == 0 a_fragments = pl.read_csv( - TEST_DIRECTORY.joinpath("a.fragments.tsv.gz"), + TEST_DIRECTORY.joinpath(file_dict["a.fragments"]), separator = "\t", has_header = False ) b_fragments = pl.read_csv( - TEST_DIRECTORY.joinpath("b.fragments.tsv.gz"), + TEST_DIRECTORY.joinpath(file_dict["b.fragments"]), separator = "\t", has_header = False ) cell_annotations = pl.read_csv( - TEST_DIRECTORY.joinpath("cell_type_annotation.tsv"), + TEST_DIRECTORY.joinpath(file_dict["cell_type_annotation"]), separator = "\t" ) + for row in cell_annotations \ .select(pl.col("cell_type")) \ .unique() \ @@ -72,3 +101,10 @@ def 
test_split_command(tmp_path): generated_fragments_cell_type ) + +def test_split_command_bc_single_type(tmp_path): + split_command_test_helper(tmp_path, FILES_ALL_BARCODES_MAPPING_TO_SINGLE_TYPE) + +def test_split_command_barcode_mapping_multiple_types(tmp_path): + split_command_test_helper(tmp_path, FILES_SOME_BARCODES_MAPPING_TO_MULTIPLE_TYPES) + From 3fc6ee08262a30baa52083cbadc6a4403da0e13a Mon Sep 17 00:00:00 2001 From: SeppeDeWinter Date: Fri, 9 Feb 2024 12:02:53 +0100 Subject: [PATCH 2/5] Implement feature in rust code --- rust/src/lib.rs | 7 +++++-- rust/src/split_fragments.rs | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 138dbcf..cbbb309 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -44,10 +44,13 @@ fn split_fragments_by_cell_barcode( verbose: bool, ) -> PyResult<()> { // Invert cell_type_to_cell_barcodes - let mut cell_barcode_to_cell_type: HashMap<String, String> = HashMap::new(); + let mut cell_barcode_to_cell_type: HashMap<String, Vec<String>> = HashMap::new(); for (cell_type, cell_barcodes) in cell_type_to_cell_barcodes.iter() { for cell_barcode in cell_barcodes.iter() { - cell_barcode_to_cell_type.insert(cell_barcode.to_string(), cell_type.to_string()); + cell_barcode_to_cell_type + .entry(cell_barcode.to_string()) + .or_insert(Vec::new()) + .push(cell_type.to_string()); } } split_fragments::split_fragments_by_cell_barcode( diff --git a/rust/src/split_fragments.rs b/rust/src/split_fragments.rs index 031585d..e7772ed 100644 --- a/rust/src/split_fragments.rs +++ b/rust/src/split_fragments.rs @@ -83,7 +83,7 @@ fn sanitize_string_for_filename(s: String) -> String { pub fn split_fragments_by_cell_barcode( path_to_fragments: &String, path_to_output_folder: &String, - cell_barcode_to_cell_type: HashMap<String, String>, + cell_barcode_to_cell_type: HashMap<String, Vec<String>>, chromsizes: HashMap, number_of_threads: u32, verbose: bool, @@ -101,7 +101,11 @@ pub fn split_fragments_by_cell_barcode( ) }); let mut cell_type_to_writer: 
HashMap<&String, LazyBgzfWriter> = HashMap::new(); - let unique_cell_types: Vec<&String> = cell_barcode_to_cell_type.values().unique().collect(); + let unique_cell_types: Vec<&String> = cell_barcode_to_cell_type + .values() + .flatten() + .unique() + .collect(); for cell_type in unique_cell_types { let cell_type_name = sanitize_string_for_filename(cell_type.clone().to_string()); let path_to_output = format!( @@ -145,10 +149,12 @@ pub fn split_fragments_by_cell_barcode( // loop over reads while not_at_end { let read_cb = read_as_str.split('\t').nth(3).unwrap().to_string(); - if let Some(cell_type) = cell_barcode_to_cell_type.get(&read_cb) { - let writer = cell_type_to_writer.get_mut(cell_type).unwrap(); - writer.write(&read).unwrap(); - writer.write(b"\n").unwrap(); + if let Some(cell_types) = cell_barcode_to_cell_type.get(&read_cb) { + for cell_type in cell_types { + let writer = cell_type_to_writer.get_mut(cell_type).unwrap(); + writer.write(&read).unwrap(); + writer.write(b"\n").unwrap(); + } } read.clear(); not_at_end = tbx_reader.read(&mut read).unwrap(); From 52b29d8745eddb5a19615e1c2843e654849b5acc Mon Sep 17 00:00:00 2001 From: Seppe Date: Fri, 9 Feb 2024 13:24:52 +0100 Subject: [PATCH 3/5] Add changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d8fbb20 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,7 @@ +# Changelog + +## Unreleased + +### Added + +- **[004a265](https://github.com/aertslab/scatac_fragment_tools/commit/004a2654ecd5ed0a33be78f6fa5789c0a41deafb)**: Allow barcodes to map to multiple cell types while splitting fragments by cell type. 
\ No newline at end of file From eb332e48779aa14bfc783d365fc4609708ba4470 Mon Sep 17 00:00:00 2001 From: Seppe Date: Fri, 9 Feb 2024 13:55:31 +0100 Subject: [PATCH 4/5] Add changelog to pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9458e1f..0195a5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ homepage = "https://github.com/aertslab/scatac_fragment_tools/" documentation = "https://aertslab.github.io/scatac_fragment_tools/" repository = "https://github.com/aertslab/scatac_fragment_tools/" -changelog = "https://github.com/aertslab/scatac_fragment_tools/commits/main/" +changelog = "https://raw.githubusercontent.com/aertslab/scatac_fragment_tools/main/CHANGELOG.md" [project.optional-dependencies] development = [ From 1c72a122311ae9765dd8466ad0e6b63c8014ded3 Mon Sep 17 00:00:00 2001 From: Seppe Date: Wed, 27 Mar 2024 10:12:04 +0100 Subject: [PATCH 5/5] Change version to v0.1.1 --- CHANGELOG.md | 2 +- rust/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8fbb20..ce9bda2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## Unreleased +## v0.1.1 ### Added diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 2f1906e..d8b7c7c 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "_rust_scatac_fragment_tools" -version = "0.1.0" +version = "0.1.1" authors = ["Seppe De Winter ", "Gert Hulselmans "] edition = "2021"