Skip to content

Commit

Permalink
Merge pull request #135 from rhpvorderman/betternibbledecode
Browse files Browse the repository at this point in the history
Decode BAM sequence with less code. Dispatch function at dynamic link time rather than at run time.
  • Loading branch information
marcelm committed Jul 17, 2024
2 parents d19338d + d43c6fc commit c89d133
Showing 1 changed file with 24 additions and 57 deletions.
81 changes: 24 additions & 57 deletions src/dnaio/bam.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#endif

#define COMPILER_HAS_TARGET (GCC_AT_LEAST(4, 8) || CLANG_COMPILER_HAS(__target__))
#define COMPILER_HAS_CONSTRUCTOR (__GNUC__ || CLANG_COMPILER_HAS(constructor))
#define COMPILER_HAS_OPTIMIZE (GCC_AT_LEAST(4,4) || CLANG_COMPILER_HAS(optimize))

#if defined(__x86_64__) || defined(_M_X64)
Expand Down Expand Up @@ -56,7 +57,11 @@ decode_bam_sequence_default(uint8_t *dest, const uint8_t *encoded_sequence, size
}
}

#if COMPILER_HAS_TARGET && BUILD_IS_X86_64
static void (*decode_bam_sequence)(
uint8_t *dest, const uint8_t *encoded_sequence, size_t length
) = decode_bam_sequence_default;

#if COMPILER_HAS_TARGET && COMPILER_HAS_CONSTRUCTOR && BUILD_IS_X86_64
__attribute__((__target__("ssse3")))
static void
decode_bam_sequence_ssse3(uint8_t *dest, const uint8_t *encoded_sequence, size_t length)
Expand All @@ -66,79 +71,41 @@ decode_bam_sequence_ssse3(uint8_t *dest, const uint8_t *encoded_sequence, size_t
const uint8_t *dest_end_ptr = dest + length;
uint8_t *dest_cursor = dest;
const uint8_t *encoded_cursor = encoded_sequence;
const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i));
__m128i first_upper_shuffle = _mm_setr_epi8(
0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
__m128i first_lower_shuffle = _mm_setr_epi8(
0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
__m128i second_upper_shuffle = _mm_setr_epi8(
8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
__m128i second_lower_shuffle = _mm_setr_epi8(
0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i) - 1);
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)nuc_lookup);
/* Work on 16 encoded characters at the time resulting in 32 decoded characters
Examples are given for 8 encoded characters A until H to keep it readable.
Encoded stored as |AB|CD|EF|GH|
Shuffle into |AB|00|CD|00|EF|00|GH|00| and
|00|AB|00|CD|00|EF|00|GH|
Shift upper to the right resulting into
|0A|B0|0C|D0|0E|F0|0G|H0| and
|00|AB|00|CD|00|EF|00|GH|
Merge with or resulting into (X stands for garbage)
|0A|XB|0C|XD|0E|XF|0G|XH|
Bitwise and with 0b1111 leads to:
|0A|0B|0C|0D|0E|0F|0G|0H|
We can use the resulting 4-bit integers as indexes for the shuffle of
the nucleotide lookup. */
/* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes
as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used
together with the pshufb instruction as a lookup table. The most efficient
the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|).
The lookup can then be performed and the resulting vectors can be
interleaved again using the unpack instructions. */
while (dest_cursor < dest_vec_end_ptr) {
__m128i encoded = _mm_lddqu_si128((__m128i *)encoded_cursor);

__m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
__m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
__m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
__m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
__m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
__m128i encoded_upper = _mm_srli_epi64(encoded, 4);
encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15));
__m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15));
__m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper);
__m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower);
__m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower);
__m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower);
_mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides);

__m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
__m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
__m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
__m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
__m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
_mm_storeu_si128((__m128i *)(dest_cursor + 16), second_nucleotides);

encoded_cursor += sizeof(__m128i);
dest_cursor += 2 * sizeof(__m128i);
}
decode_bam_sequence_default(dest_cursor, encoded_cursor, dest_end_ptr - dest_cursor);
}

static void (*decode_bam_sequence)(
uint8_t *dest, const uint8_t *encoded_sequence, size_t length);

/* Simple dispatcher function, updates the function pointer after testing the
CPU capabilities. After this, the dispatcher function is not needed anymore. */
static void decode_bam_sequence_dispatch(
uint8_t *dest, const uint8_t *encoded_sequence, size_t length) {
/* Constructor functions run at dynamic link time. This checks the CPU capabilities
and updates the function pointer accordingly. */
__attribute__((constructor))
static void decode_bam_sequence_dispatch(void) {
if (__builtin_cpu_supports("ssse3")) {
decode_bam_sequence = decode_bam_sequence_ssse3;
}
else {
decode_bam_sequence = decode_bam_sequence_default;
}
decode_bam_sequence(dest, encoded_sequence, length);
}

static void (*decode_bam_sequence)(
uint8_t *dest, const uint8_t *encoded_sequence, size_t length
) = decode_bam_sequence_dispatch;

#else
static inline void decode_bam_sequence(uint8_t *dest, const uint8_t *encoded_sequence, size_t length)
{
decode_bam_sequence_default(dest, encoded_sequence, length);
}
#endif

Expand Down

0 comments on commit c89d133

Please sign in to comment.