diff --git a/benches/bitpacking.rs b/benches/bitpacking.rs
index 444769d..f29402d 100644
--- a/benches/bitpacking.rs
+++ b/benches/bitpacking.rs
@@ -48,13 +48,16 @@ fn pack(c: &mut Criterion) {
     let mut group = c.benchmark_group("unpack-single");
     group.bench_function("unpack single 16 <- 3", |b| {
         const WIDTH: usize = 3;
-        let values = [3u16; 1024];
-        let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
-        BitPacking::pack::<WIDTH>(&values, &mut packed);
+        let values = vec![3u16; 1024];
+        let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];
+        BitPacking::pack::<WIDTH>(array_ref![values, 0, 1024], array_mut_ref![packed, 0, 192]);
 
         b.iter(|| {
             for i in 0..1024 {
-                black_box::<u16>(BitPacking::unpack_single::<WIDTH>(&packed, i));
+                black_box::<u16>(BitPacking::unpack_single::<WIDTH>(
+                    array_ref![packed, 0, 192],
+                    i,
+                ));
             }
         });
     });
diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 739485f..89c2e32 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -1,9 +1,9 @@
-use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied};
 use arrayref::{array_mut_ref, array_ref};
 use core::mem::size_of;
-use num_traits::One;
 use paste::paste;
 
+use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
+
 pub struct BitPackWidth<const W: usize>;
 pub trait SupportedBitPackWidth<T> {}
 impl<const W: usize, T> SupportedBitPackWidth<T> for BitPackWidth<W> where
@@ -46,12 +46,57 @@ pub trait BitPacking: FastLanes {
     fn unpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
     where
         BitPackWidth<W>: SupportedBitPackWidth<Self>,
-        Self: One,
     {
-        // TODO(ngates): implement this function to not unpack the world.
-        let mut output = [Self::zero(); 1024];
-        Self::unpack::<W>(packed, &mut output);
-        output[index]
+        // Special case for W=0, since there's only one possible value.
+        if W == 0 {
+            return Self::zero();
+        }
+
+        // We can think of the input array as effectively a row-major, left-to-right
+        // 2-D array with `Self::LANES` columns and `Self::T` rows.
+        //
+        // Meanwhile, we can think of the packed array as either:
+        //   1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
+        //   2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
+        //
+        // Bitpacking involves a transposition of the input array ordering, such that
+        // decompression can be fused efficiently with encodings like delta and RLE.
+        //
+        // First step, we need to get the lane and row for interpretation #1 above.
+        let lane = index % Self::LANES;
+        let row = {
+            // This is the inverse of the `index` function from the pack/unpack macros:
+            //     fn index(row: usize, lane: usize) -> usize {
+            //         let o = row / 8;
+            //         let s = row % 8;
+            //         (FL_ORDER[o] * 16) + (s * 128) + lane
+            //     }
+            let s = index / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
+            let fl_order = (index - s * 128 - lane) / 16; // value of FL_ORDER[o]
+            let o = FL_ORDER[fl_order]; // because this transposition is invertible!
+            o * 8 + s
+        };
+
+        // From the row, we can get the correct start bit within the lane.
+        let start_bit = row * W;
+
+        // We need to read one or two T-bit words from the lane, depending on how our
+        // target W-bit value overlaps with the T-bit words. To avoid a branch, we
+        // always read two T-bit words, and then shift/mask as needed.
+        let lo_word = start_bit / Self::T;
+        let lo_shift = start_bit % Self::T;
+        let lo = packed[Self::LANES * lo_word + lane] >> lo_shift;
+
+        let hi_word = (start_bit + W - 1) / Self::T;
+        let hi_shift = (Self::T - lo_shift) % Self::T;
+        let hi = packed[Self::LANES * hi_word + lane] << hi_shift;
+
+        let mask: Self = if W == Self::T {
+            Self::max_value()
+        } else {
+            ((Self::one()) << (W % Self::T)) - Self::one()
+        };
+        (lo | hi) & mask
     }
 
     /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements,
@@ -144,14 +189,16 @@ macro_rules! impl_packing {
 
                 seq_t!(W in $T {
                     match width {
-                        #(W => Self::unpack_single::<W>(
-                            array_ref![input, 0, 1024 * W / <$T>::T],
-                            index,
-                        ),)*
+                        #(W => {
+                            Self::unpack_single::<W>(
+                                array_ref![input, 0, 1024 * W / <$T>::T],
+                                index
+                            )
+                        })*
                         // seq_t has exclusive upper bound
                         Self::T => Self::unpack_single::<{ Self::T }>(
                             array_ref![input, 0, 1024],
-                            index,
+                            index
                         ),
                         _ => unreachable!("Unsupported width: {}", width)
                     }
@@ -169,12 +216,13 @@ impl_packing!(u64);
 
 #[cfg(test)]
 mod test {
-    use super::*;
     use core::array;
     use core::fmt::Debug;
     use core::mem::size_of;
     use seq_macro::seq;
 
+    use super::*;
+
     #[test]
     fn test_unchecked_pack() {
         let input = array::from_fn(|i| i as u32);
@@ -218,6 +266,14 @@ mod test {
 
         BitPacking::unpack::<W>(&packed, &mut unpacked);
         assert_eq!(&unpacked, &values);
+
+        for i in 0..1024 {
+            assert_eq!(BitPacking::unpack_single::<W>(&packed, i), values[i]);
+            assert_eq!(
+                unsafe { BitPacking::unchecked_unpack_single(W, &packed, i) },
+                values[i]
+            );
+        }
     }
 
     macro_rules! impl_try_round_trip {
diff --git a/src/lib.rs b/src/lib.rs
index 1e66cf2..4bf858d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,3 +45,16 @@ macro_rules! seq_t {
     ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
     ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
 }
+
+#[cfg(test)]
+mod test {
+    use crate::FL_ORDER;
+
+    #[test]
+    fn test_ordering_is_own_inverse() {
+        // Check that FL_ORDER "round-trips"; i.e., it is its own inverse permutation.
+        for i in 0..8 {
+            assert_eq!(FL_ORDER[FL_ORDER[i]], i);
+        }
+    }
+}
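
Note, not part of the patch: the sketch below is a standalone check of the row/lane inversion that `unpack_single` relies on, written for the u16 case. The constants are assumptions that mirror the crate (`FL_ORDER` as the FastLanes 04261537 permutation, `T = 16`, `LANES = 1024 / T`), and the local `index` helper simply restates the forward mapping quoted in the patch's comments; none of these names are crate APIs.

// Standalone sketch: verify that the lane/row recovery in `unpack_single` inverts the
// transposed `index` function for every position of the 1024-element block.
// Assumed constants for the u16 case: T = 16 bits, LANES = 1024 / T = 64.
const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7];
const T: usize = 16;
const LANES: usize = 1024 / T;

// Forward transposed index, as quoted in the patch's comments.
fn index(row: usize, lane: usize) -> usize {
    let o = row / 8;
    let s = row % 8;
    (FL_ORDER[o] * 16) + (s * 128) + lane
}

fn main() {
    for row in 0..T {
        for lane in 0..LANES {
            let idx = index(row, lane);
            // Inverse mapping, mirroring the logic added in `unpack_single`.
            // E.g. idx = 100: lane = 36, s = 0, fl_order = (100 - 36) / 16 = 4,
            // o = FL_ORDER[4] = 1, row = 1 * 8 + 0 = 8, and index(8, 36) == 100.
            let lane2 = idx % LANES;
            let s = idx / 128; // (FL_ORDER[o] * 16) + lane is always < 128
            let fl_order = (idx - s * 128 - lane2) / 16;
            let o = FL_ORDER[fl_order]; // FL_ORDER is its own inverse
            let row2 = o * 8 + s;
            assert_eq!((row2, lane2), (row, lane));
        }
    }
}

The `test_ordering_is_own_inverse` test added to src/lib.rs is what justifies the `let o = FL_ORDER[fl_order]` step: since the permutation is an involution, recovering `FL_ORDER[o]` and indexing back through `FL_ORDER` yields `o` again.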