From 566288f0be51f633cd8138d674e34a684b79b4b1 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 30 Jun 2024 20:11:35 +0200 Subject: [PATCH] Simplify bam decode function --- src/dnaio/bam.h | 52 +++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/src/dnaio/bam.h b/src/dnaio/bam.h index 5d4e952..da5ace3 100644 --- a/src/dnaio/bam.h +++ b/src/dnaio/bam.h @@ -72,49 +72,25 @@ decode_bam_sequence_ssse3(uint8_t *dest, const uint8_t *encoded_sequence, size_t const uint8_t *dest_end_ptr = dest + length; uint8_t *dest_cursor = dest; const uint8_t *encoded_cursor = encoded_sequence; - const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i)); - __m128i first_upper_shuffle = _mm_setr_epi8( - 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff); - __m128i first_lower_shuffle = _mm_setr_epi8( - 0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7); - __m128i second_upper_shuffle = _mm_setr_epi8( - 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff); - __m128i second_lower_shuffle = _mm_setr_epi8( - 0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15); + const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i) - 1); __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)nuc_lookup); - /* Work on 16 encoded characters at the time resulting in 32 decoded characters - Examples are given for 8 encoded characters A until H to keep it readable. - Encoded stored as |AB|CD|EF|GH| - Shuffle into |AB|00|CD|00|EF|00|GH|00| and - |00|AB|00|CD|00|EF|00|GH| - Shift upper to the right resulting into - |0A|B0|0C|D0|0E|F0|0G|H0| and - |00|AB|00|CD|00|EF|00|GH| - Merge with or resulting into (X stands for garbage) - |0A|XB|0C|XD|0E|XF|0G|XH| - Bitwise and with 0b1111 leads to: - |0A|0B|0C|0D|0E|0F|0G|0H| - We can use the resulting 4-bit integers as indexes for the shuffle of - the nucleotide lookup. 
*/ + /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes + as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used + together with the pshufb instruction as a lookup table. The most efficient + way is to split them into two vectors: one with the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|). + The lookup can then be performed and the resulting vectors can be + interleaved again using the unpack instructions. */ while (dest_cursor < dest_vec_end_ptr) { __m128i encoded = _mm_lddqu_si128((__m128i *)encoded_cursor); - - __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle); - __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle); - __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4); - __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower); - __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111)); - __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes); + __m128i encoded_upper = _mm_srli_epi64(encoded, 4); + encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15)); + __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15)); + __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper); + __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower); + __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower); + __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower); _mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides); - - __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle); - __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle); - __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4); - __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower); - __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111)); - __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes); _mm_storeu_si128((__m128i 
*)(dest_cursor + 16), second_nucleotides); - encoded_cursor += sizeof(__m128i); dest_cursor += 2 * sizeof(__m128i); }