Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reimplement Bitpacking::unpack_single #42

Merged
merged 17 commits into from
Jul 3, 2024
11 changes: 7 additions & 4 deletions benches/bitpacking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,16 @@ fn pack(c: &mut Criterion) {
let mut group = c.benchmark_group("unpack-single");
group.bench_function("unpack single 16 <- 3", |b| {
const WIDTH: usize = 3;
let values = [3u16; 1024];
let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
BitPacking::pack::<WIDTH>(&values, &mut packed);
let values = vec![3u16; 1024];
let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];
BitPacking::pack::<WIDTH>(array_ref![values, 0, 1024], array_mut_ref![packed, 0, 192]);

b.iter(|| {
for i in 0..1024 {
black_box::<u16>(BitPacking::unpack_single::<WIDTH>(&packed, i));
black_box::<u16>(BitPacking::unpack_single::<WIDTH>(
array_ref![packed, 0, 192],
i,
));
}
});
});
Expand Down
82 changes: 69 additions & 13 deletions src/bitpacking.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied};
use arrayref::{array_mut_ref, array_ref};
use core::mem::size_of;
use num_traits::One;
use paste::paste;

use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};

pub struct BitPackWidth<const W: usize>;
pub trait SupportedBitPackWidth<T> {}
impl<const W: usize, T> SupportedBitPackWidth<T> for BitPackWidth<W> where
Expand Down Expand Up @@ -46,12 +46,57 @@ pub trait BitPacking: FastLanes {
fn unpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
where
BitPackWidth<W>: SupportedBitPackWidth<Self>,
Self: One,
{
// TODO(ngates): implement this function to not unpack the world.
let mut output = [Self::zero(); 1024];
Self::unpack::<W>(packed, &mut output);
output[index]
// Special case for W=0, since there's only one possible value.
if W == 0 {
return Self::zero();
}

// We can think of the input array as effectively a row-major, left-to-right
// 2-D array of with `Self::LANES` columns and `Self::T` rows.
//
// Meanwhile, we can think of the packed array as either:
// 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
// 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
//
// Bitpacking involves a transposition of the input array ordering, such that
// decompression can be fused efficiently with encodings like delta and RLE.
//
// First step, we need to get the lane and row for interpretation #1 above.
let lane = index % Self::LANES;
let row = {
// This is the inverse of the `index` function from the pack/unpack macros:
// fn index(row: usize, lane: usize) -> usize {
// let o = row / 8;
// let s = row % 8;
// (FL_ORDER[o] * 16) + (s * 128) + lane
// }
let s = index / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
let fl_order = (index - s * 128 - lane) / 16; // value of FL_ORDER[o]
let o = FL_ORDER[fl_order]; // because this transposition is invertible!
o * 8 + s
};

// From the row, we can get the correct start bit within the lane.
let start_bit = row * W;

// We need to read one or two T-bit words from the lane, depending on how our
// target W-bit value overlaps with the T-bit words. To avoid a branch, we
// always read two T-bit words, and then shift/mask as needed.
let lo_word = start_bit / Self::T;
let lo_shift = start_bit % Self::T;
let lo = packed[Self::LANES * lo_word + lane] >> lo_shift;

let hi_word = (start_bit + W - 1) / Self::T;
let hi_shift = (Self::T - lo_shift) % Self::T;
let hi = packed[Self::LANES * hi_word + lane] << hi_shift;

let mask: Self = if W == Self::T {
Self::max_value()
} else {
((Self::one()) << (W % Self::T)) - Self::one()
};
(lo | hi) & mask
}

/// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements,
Expand Down Expand Up @@ -144,14 +189,16 @@ macro_rules! impl_packing {

seq_t!(W in $T {
match width {
#(W => Self::unpack_single::<W>(
array_ref![input, 0, 1024 * W / <$T>::T],
index,
),)*
#(W => {
Self::unpack_single::<W>(
array_ref![input, 0, 1024 * W / <$T>::T],
index
)
})*
// seq_t has exclusive upper bound
Self::T => Self::unpack_single::<{ Self::T }>(
array_ref![input, 0, 1024],
index,
index
),
_ => unreachable!("Unsupported width: {}", width)
}
Expand All @@ -169,12 +216,13 @@ impl_packing!(u64);

#[cfg(test)]
mod test {
use super::*;
use core::array;
use core::fmt::Debug;
use core::mem::size_of;
use seq_macro::seq;

use super::*;

#[test]
fn test_unchecked_pack() {
let input = array::from_fn(|i| i as u32);
Expand Down Expand Up @@ -218,6 +266,14 @@ mod test {
BitPacking::unpack::<W>(&packed, &mut unpacked);

assert_eq!(&unpacked, &values);

for i in 0..1024 {
assert_eq!(BitPacking::unpack_single::<W>(&packed, i), values[i]);
assert_eq!(
unsafe { BitPacking::unchecked_unpack_single(W, &packed, i) },
values[i]
);
}
}

macro_rules! impl_try_round_trip {
Expand Down
13 changes: 13 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,16 @@ macro_rules! seq_t {
($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
}

#[cfg(test)]
mod test {
use crate::FL_ORDER;

#[test]
fn test_ordering_is_own_inverse() {
// Check that FL_ORDER "round-trips"; i.e., it is its own inverse permutation.
for i in 0..8 {
assert_eq!(FL_ORDER[FL_ORDER[i]], i);
}
}
}
Loading