Merge commit 'b38d62e1b5542db5eb331e28989b918489b14c24'

steineggerlab · Jun 27, 2023 · c760a33 · c760a33
2 parents 401180f + b38d62e
commit c760a33
Show file tree

Hide file tree

Showing 38 changed files with 1,407 additions and 1,034 deletions.
diff --git a/lib/block-aligner/README.md b/lib/block-aligner/README.md
@@ -77,6 +77,7 @@ of ~10% sequence length performs well (tested with reads up to ~50kbps).
 For proteins, a min block size of 32 and a max block size of 256 performs well.
 Using a minimum block size that is at least 32 is recommended for most applications.
 Using a maximum block size greater than `2^14 = 16384` is not recommended.
+The library contains a `percent_len` function that computes a percentage of the sequence length with these recommendations.
 If the alignment scores are saturating (score too large), then use a smaller block size.
 Let me know how block aligner performs on your data!
 
@@ -136,7 +137,20 @@ the [C readme](c/README.md).
 See the `3di` branch for an example of using block aligner to do local alignment in C,
 along with block aligner modifications to support aligning with amino acid 3D interaction (3Di) information.
 
-Most of the instructions below are for benchmarking and testing block aligner.
+## Improving Block Aligner
+During alignment, three decisions need to be made at each step (using heuristics):
+* Whether to grow the block size
+* Whether to shrink the block size
+* Whether to shift right or down
+
+Block aligner uses simple greedy heuristics that are cheap to evaluate for making these decisions.
+There is probably a lot of room to improve here! Maybe seeds? Neural network models?
+
+To try your ideas, take a look at the code after the comment `// TODO: better heuristics?` in `src/scan_block.rs`
+(depending on your changes, you may need to modify other parts of the code too). Let me know if you
+are working on new ideas!
+
+**Most of the instructions below are for benchmarking and testing block aligner.**
 
 ## Data
 Some Illumina/Nanopore (DNA), Uniclust30 (protein), and SCOP (protein profile) data are used in some tests and benchmarks.
@@ -157,11 +171,13 @@ Run `scripts/doc_avx2.sh` or `scripts/doc_wasm.sh` to build the docs locally.
 
 ## Benchmark
 Run `scripts/bench_avx2.sh` or `scripts/bench_wasm.sh` for basic benchmarks.
-See the `scripts` directory for more benchmark scripts on real data.
+See the `scripts` directory for runnable benchmark scripts on real data.
+Most of the actual implementations of the benchmarks are in the `examples` directory.
 
 ## Data analysis and visualizations
 Use the Jupyter notebook in the `vis/` directory to gather data and plot them. An easier way
-to run the whole notebook is to run the `vis/run_vis.sh` script.
+to run the whole notebook is to run the `vis/run_vis.sh` script. This reproduces the
+experiments in the manuscript.
 
 ## Profiling with MacOS Instruments
 Use

diff --git a/lib/block-aligner/c/README.md b/lib/block-aligner/c/README.md
@@ -2,21 +2,23 @@
 This directory contains an example of how to use the C API of block aligner.
 
 Currently, only sequence to sequence and sequence to profile alignment with
-proteins is supported with the C API. Other features may be added if there
-is demand for them.
+amino acid scoring matrices is supported with the C API. However, this can easily be adapted
+to align nucleotides by setting custom match/mismatch scores.
+Other features may be added if there is demand for them.
 
 ## Running the example
 1. `cd` into this directory.
 2. Run `make`. This will build block aligner in release mode, use cbindgen
 to generate the header file, and make sure block aligner is linked to the
-example program. Make sure you have [cbindgen](https://github.com/eqrion/cbindgen) installed
-if you are making changes to the block aligner code and need to regenerate bindings.
+example program. You only need to have [cbindgen](https://github.com/eqrion/cbindgen) installed
+if you are making changes to the block aligner code and need to regenerate bindings. If you are only using
+the library, a generated header file is provided in this directory.
 3. Run `./a.out`. This will run the example program to perform alignment
 calculations.
 
 The generated header file, `c/block_aligner.h`, should be included in
 code that calls block aligner functions. It is C++ compatible.
-Like in the example `Makefile`, the `block_aligner` library in `c/target/release`
+Like in the example `Makefile`, the `block_aligner_c` library in `c/target/release`
 must be linked to any C/C++ code that calls block aligner functions.
 
 Note that this directory has a minimal `Cargo.toml` that has no dependencies, so

diff --git a/lib/block-aligner/examples/debug.rs b/lib/block-aligner/examples/debug.rs
@@ -41,6 +41,6 @@ fn main() {
         scan_cigar,
         a,
         b,
-        bio_alignment.pretty(&q, &r)
+        bio_alignment.pretty(&q, &r, 1_000_000_000)
     );
 }
diff --git a/lib/block-aligner/examples/nanopore_accuracy.rs b/lib/block-aligner/examples/nanopore_accuracy.rs
@@ -2,7 +2,7 @@
 fn main() {}
 
 #[cfg(feature = "simd_avx2")]
-fn test(file_name: &str, max_size: usize, name: &str, verbose: bool, writer: &mut impl std::io::Write) -> (usize, usize, usize, f64, usize, f64) {
+fn test(file_name: &str, max_size: usize, name: &str, verbose: bool, writer: &mut impl std::io::Write) -> (usize, usize, usize, f64, usize, f64, f64, usize, usize) {
     use parasailors::{Matrix, *};
 
     use rust_wfa2::aligner::*;
@@ -22,6 +22,9 @@ fn test(file_name: &str, max_size: usize, name: &str, verbose: bool, writer: &mu
     let mut wrong_avg = 0f64;
     let mut count = 0usize;
     let mut seq_id_avg = 0f64;
+    let mut largest_gap_avg = 0f64;
+    let mut largest_gap_max = 0usize;
+    let mut largest_gap_max_correct = 0usize;
     let reader = BufReader::new(File::open(file_name).unwrap());
     let all_lines = reader.lines().collect::<Vec<_>>();
 
@@ -61,16 +64,6 @@ fn test(file_name: &str, max_size: usize, name: &str, verbose: bool, writer: &mu
         block_aligner.align(&q_padded, &r_padded, &matrix, run_gaps, percent_len(max_len, 0.01)..=percent_len(max_len, 0.1), 0);
         let scan_score = block_aligner.res().score;
 
-        write!(
-            writer,
-            "{}, {}, {}, {}, {}\n",
-            name,
-            q.len(),
-            r.len(),
-            scan_score,
-            correct_score
-        ).unwrap();
-
         if correct_score != scan_score {
             wrong += 1;
             wrong_avg += ((correct_score - scan_score) as f64) / (correct_score as f64);
@@ -98,10 +91,11 @@ fn test(file_name: &str, max_size: usize, name: &str, verbose: bool, writer: &mu
 
         let wfa_adaptive_score = {
             let mut wfa = WFAlignerGapAffine::new(4, 4, 2, AlignmentScope::Score, MemoryModel::MemoryHigh);
-            wfa.set_heuristic(Heuristic::WFadaptive(10, 50, 1));
+            wfa.set_heuristic(Heuristic::WFadaptive(10, percent_len(q.len().max(r.len()), 0.01) as i32, 1));
             wfa.align_end_to_end(q.as_bytes(), r.as_bytes());
             wfa.score()
         };
+        let largest_gap;
         let wfa_score = {
             let mut wfa = WFAlignerGapAffine::new(4, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh);
             wfa.set_heuristic(Heuristic::None);
@@ -110,16 +104,54 @@ fn test(file_name: &str, max_size: usize, name: &str, verbose: bool, writer: &mu
             let matches = cigar.bytes().filter(|&c| c == b'M').count();
             let seq_id = (matches as f64) / (cigar.len() as f64);
             seq_id_avg += seq_id;
+            largest_gap = get_largest_gap(cigar.as_bytes());
+            largest_gap_avg += largest_gap as f64;
+            largest_gap_max = largest_gap_max.max(largest_gap);
+            if scan_score == correct_score {
+                largest_gap_max_correct = largest_gap_max_correct.max(largest_gap);
+            }
             wfa.score()
         };
         if wfa_adaptive_score != wfa_score {
             wfa_wrong += 1;
         }
 
+        write!(
+            writer,
+            "{}, {}, {}, {}, {}, {}\n",
+            name,
+            q.len(),
+            r.len(),
+            largest_gap,
+            scan_score,
+            correct_score
+        ).unwrap();
+
         count += 1;
     }
 
-    (wrong, min_size_wrong, wfa_wrong, wrong_avg / (wrong as f64), count, seq_id_avg / (count as f64))
+    (wrong, min_size_wrong, wfa_wrong, wrong_avg / (wrong as f64), count, seq_id_avg / (count as f64), largest_gap_avg / (count as f64), largest_gap_max, largest_gap_max_correct)
+}
+
+#[cfg(feature = "simd_avx2")]
+fn get_largest_gap(cigar: &[u8]) -> usize {
+    let mut prev = b'?';
+    let mut curr = 0;
+    let mut max = 0;
+
+    for &op in cigar {
+        if op != prev {
+            prev = op;
+            curr = 0;
+        }
+
+        if op == b'I' || op == b'D' {
+            curr += 1;
+            max = max.max(curr);
+        }
+    }
+
+    max
 }
 
 #[cfg(feature = "simd_avx2")]
@@ -136,14 +168,15 @@ fn main() {
 
     let out_file_name = "data/nanopore_accuracy.csv";
     let mut writer = BufWriter::new(File::create(out_file_name).unwrap());
-    write!(writer, "dataset, query len, reference len, pred score, true score\n").unwrap();
+    write!(writer, "dataset, query len, reference len, largest gap, pred score, true score\n").unwrap();
 
     println!("\ndataset, total, wrong, wrong % error, min size wrong, wfa wrong");
 
     for ((path, name), &max_size) in paths.iter().zip(&names).zip(&max_size) {
-        let (wrong, min_size_wrong, wfa_wrong, wrong_avg, count, seq_id_avg) = test(path, max_size, name, verbose, &mut writer);
+        let (wrong, min_size_wrong, wfa_wrong, wrong_avg, count, seq_id_avg, largest_gap_avg, largest_gap_max, largest_gap_max_correct) = test(path, max_size, name, verbose, &mut writer);
         println!("\n{}, {}, {}, {}, {}, {}", name, count, wrong, wrong_avg, min_size_wrong, wfa_wrong);
         println!("# {} seq id avg: {}", name, seq_id_avg);
+        println!("# {} largest gap avg: {}, largest gap max: {}, largest gap max for correct: {}", name, largest_gap_avg, largest_gap_max, largest_gap_max_correct);
     }
 
     println!("# Done!");

diff --git a/lib/block-aligner/examples/nanopore_bench_global.rs b/lib/block-aligner/examples/nanopore_bench_global.rs
@@ -13,6 +13,7 @@ use ksw2_sys::*;
 use block_aligner::percent_len;
 use block_aligner::scan_block::*;
 use block_aligner::scores::*;
+use block_aligner::cigar::*;
 
 use std::fs::File;
 use std::io::{BufRead, BufReader};
@@ -61,19 +62,21 @@ fn bench_parasailors(file: &str) -> f64 {
 fn bench_wfa2(file: &str, use_heuristic: bool) -> f64 {
     let data = get_data(file);
 
+    let mut wfa = WFAlignerGapAffine::new(4, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh);
+
     let mut total_time = 0f64;
     let mut temp = 0i32;
     for (q, r) in &data {
-        let mut wfa = WFAlignerGapAffine::new(4, 4, 2, AlignmentScope::Score, MemoryModel::MemoryHigh);
         if use_heuristic {
-            wfa.set_heuristic(Heuristic::WFadaptive(10, 50, 1));
+            wfa.set_heuristic(Heuristic::WFadaptive(10, percent_len(q.len().max(r.len()), 0.01) as i32, 1));
         } else {
             wfa.set_heuristic(Heuristic::None);
         }
         let start = Instant::now();
         wfa.align_end_to_end(&q, &r);
         total_time += start.elapsed().as_secs_f64();
         temp = temp.wrapping_add(wfa.score());
+        temp = temp.wrapping_add(wfa.cigar().len() as i32);
     }
     black_box(temp);
     total_time
@@ -86,10 +89,13 @@ fn bench_edlib(file: &str) -> f64 {
     let mut total_time = 0f64;
     let mut temp = 0i32;
     for (q, r) in &data {
+        let mut config = EdlibAlignConfigRs::default();
+        config.task = EdlibAlignTaskRs::EDLIB_TASK_PATH;
         let start = Instant::now();
-        let res = edlibAlignRs(&q, &r, &EdlibAlignConfigRs::default());
+        let res = edlibAlignRs(&q, &r, &config);
         total_time += start.elapsed().as_secs_f64();
         temp = temp.wrapping_add(res.editDistance);
+        temp = temp.wrapping_add(res.alignment.unwrap().len() as i32);
     }
     black_box(temp);
     total_time
@@ -125,10 +131,11 @@ fn bench_ksw2(file: &str, band_width_percent: f32) -> f64 {
         let band_width = percent_len(q.len().max(r.len()), band_width_percent) as i32;
         let start = Instant::now();
         unsafe {
-            ksw_extz2_sse(std::ptr::null_mut(), q.len() as i32, q.as_ptr(), r.len() as i32, r.as_ptr(), 5, matrix.as_ptr(), 4, 2, band_width, -1, 0, 1, &mut res);
+            ksw_extz2_sse(std::ptr::null_mut(), q.len() as i32, q.as_ptr(), r.len() as i32, r.as_ptr(), 5, matrix.as_ptr(), 4, 2, band_width, -1, 0, 0 /* score only = 1 */, &mut res);
         }
         total_time += start.elapsed().as_secs_f64();
         temp = temp.wrapping_add(res.score);
+        temp = temp.wrapping_add(res.n_cigar);
     }
     black_box(temp);
     total_time
@@ -151,10 +158,13 @@ fn bench_ours(file: &str, trace: bool, max_size: usize, block_grow: bool) -> f64
 
         if trace {
             let mut a = Block::<true, false>::new(q.len(), r.len(), max_size);
+            let mut cigar = Cigar::new(q.len(), r.len());
             let start = Instant::now();
             a.align(&q, &r, &matrix, bench_gaps, percent_len(max_len, 0.01)..=percent_len(max_len, max_percent), 0);
+            a.trace().cigar(a.res().query_idx, a.res().reference_idx, &mut cigar);
             total_time += start.elapsed().as_secs_f64();
             temp = temp.wrapping_add(a.res().score); // prevent optimizations
+            temp = temp.wrapping_add(cigar.len() as i32); // prevent optimizations
         } else {
             let mut a = Block::<false, false>::new(q.len(), r.len(), max_size);
             let start = Instant::now();
@@ -179,7 +189,7 @@ fn main() {
 
     for (((file, name), max_size), &run_parasail) in files.iter().zip(&names).zip(&max_sizes).zip(&run_parasail_arr) {
         for (&s, &g) in max_size.iter().zip(&[false, true]) {
-            let t = bench_ours(file, false, s, g);
+            let t = bench_ours(file, true, s, g);
             println!("{}, ours ({}), {}", name, if g { "1%-10%" } else { "1%-1%" }, t);
         }
 

diff --git a/lib/block-aligner/examples/uc_accuracy.rs b/lib/block-aligner/examples/uc_accuracy.rs
@@ -73,7 +73,7 @@ fn test(file_name: &str, min_size: usize, max_size: usize, string: &str, verbose
                     q,
                     r.len(),
                     r,
-                    bio_alignment.pretty(q.as_bytes(), r.as_bytes()),
+                    bio_alignment.pretty(q.as_bytes(), r.as_bytes(), 1_000_000_000),
                     a_pretty,
                     b_pretty
                 );

diff --git a/lib/block-aligner/src/cigar.rs b/lib/block-aligner/src/cigar.rs
@@ -78,6 +78,11 @@ impl Cigar {
         (*self.s.as_mut_ptr().add(self.idx - 1)).len += 1;
     }
 
+    /// Reverse the CIGAR string in place.
+    pub fn reverse(&mut self) {
+        self.s[1..self.idx].reverse();
+    }
+
     /// Length of the CIGAR string, not including the first sentinel.
     pub fn len(&self) -> usize {
         self.idx - 1

diff --git a/lib/block-aligner/src/lib.rs b/lib/block-aligner/src/lib.rs
@@ -105,7 +105,7 @@ compile_error!("No SIMD feature flag specified! Specify \"no_simd\" to disable a
 /// Calculate the percentage of a length, rounded to the next power of two.
 ///
 /// This is useful for computing the min and max block sizes for sequences of a certain
-/// length by using percentages. The returned value is at least 32.
+/// length by using percentages. The returned value is at least 32 and at most 16384.
 pub fn percent_len(len: usize, p: f32) -> usize {
-    ((p * (len as f32)).round() as usize).max(32).next_power_of_two()
+    ((p * (len as f32)).round() as usize).max(32).next_power_of_two().min(1 << 14)
 }
diff --git a/lib/block-aligner/src/neon.rs b/lib/block-aligner/src/neon.rs
@@ -134,7 +134,7 @@ macro_rules! simd_sr_i16 {
 // hardcoded to STEP = 8
 #[target_feature(enable = "neon")]
 #[inline]
-pub unsafe fn simd_step(a: Simd, b: Simd) -> Simd {
+pub unsafe fn simd_step(a: Simd, _b: Simd) -> Simd {
     a
 }