diff --git a/benches/bitpacking.rs b/benches/bitpacking.rs
index 444769d..f29402d 100644
--- a/benches/bitpacking.rs
+++ b/benches/bitpacking.rs
@@ -48,13 +48,16 @@ fn pack(c: &mut Criterion) {
     let mut group = c.benchmark_group("unpack-single");
     group.bench_function("unpack single 16 <- 3", |b| {
         const WIDTH: usize = 3;
-        let values = [3u16; 1024];
-        let mut packed = [0; 128 * WIDTH / size_of::<u16>()];
-        BitPacking::pack::<WIDTH>(&values, &mut packed);
+        let values = vec![3u16; 1024];
+        let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];
+        BitPacking::pack::<WIDTH>(array_ref![values, 0, 1024], array_mut_ref![packed, 0, 192]);
 
         b.iter(|| {
             for i in 0..1024 {
-                black_box::<u16>(BitPacking::unpack_single::<WIDTH>(&packed, i));
+                black_box::<u16>(BitPacking::unpack_single::<WIDTH>(
+                    array_ref![packed, 0, 192],
+                    i,
+                ));
             }
         });
     });
diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 739485f..89c2e32 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -1,9 +1,9 @@
-use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied};
 use arrayref::{array_mut_ref, array_ref};
 use core::mem::size_of;
-use num_traits::One;
 use paste::paste;
 
+use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
+
 pub struct BitPackWidth<const W: usize>;
 pub trait SupportedBitPackWidth<T> {}
 impl<const W: usize, T> SupportedBitPackWidth<T> for BitPackWidth<W> where
@@ -46,12 +46,57 @@ pub trait BitPacking: FastLanes {
     fn unpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
     where
         BitPackWidth<W>: SupportedBitPackWidth<Self>,
-        Self: One,
     {
-        // TODO(ngates): implement this function to not unpack the world.
-        let mut output = [Self::zero(); 1024];
-        Self::unpack::<W>(packed, &mut output);
-        output[index]
+        // Special case for W=0, since there's only one possible value.
+        if W == 0 {
+            return Self::zero();
+        }
+
+        // We can think of the input array as effectively a row-major, left-to-right
+        // 2-D array with `Self::LANES` columns and `Self::T` rows.
+        //
+        // Meanwhile, we can think of the packed array as either:
+        //   1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
+        //   2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
+        //
+        // Bitpacking involves a transposition of the input array ordering, such that
+        // decompression can be fused efficiently with encodings like delta and RLE.
+        //
+        // First step, we need to get the lane and row for interpretation #1 above.
+        let lane = index % Self::LANES;
+        let row = {
+            // This is the inverse of the `index` function from the pack/unpack macros:
+            //     fn index(row: usize, lane: usize) -> usize {
+            //         let o = row / 8;
+            //         let s = row % 8;
+            //         (FL_ORDER[o] * 16) + (s * 128) + lane
+            //     }
+            let s = index / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
+            let fl_order = (index - s * 128 - lane) / 16; // value of FL_ORDER[o]
+            let o = FL_ORDER[fl_order]; // because this transposition is invertible!
+            o * 8 + s
+        };
+
+        // From the row, we can get the correct start bit within the lane.
+        let start_bit = row * W;
+
+        // We need to read one or two T-bit words from the lane, depending on how our
+        // target W-bit value overlaps with the T-bit words. To avoid a branch, we
+        // always read two T-bit words, and then shift/mask as needed.
+        let lo_word = start_bit / Self::T;
+        let lo_shift = start_bit % Self::T;
+        let lo = packed[Self::LANES * lo_word + lane] >> lo_shift;
+
+        let hi_word = (start_bit + W - 1) / Self::T;
+        let hi_shift = (Self::T - lo_shift) % Self::T;
+        let hi = packed[Self::LANES * hi_word + lane] << hi_shift;
+
+        let mask: Self = if W == Self::T {
+            Self::max_value()
+        } else {
+            ((Self::one()) << (W % Self::T)) - Self::one()
+        };
+        (lo | hi) & mask
     }
 
     /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements,
@@ -144,14 +189,16 @@ macro_rules! impl_packing {
 
                 seq_t!(W in $T {
                     match width {
-                        #(W => Self::unpack_single::<W>(
-                            array_ref![input, 0, 1024 * W / <$T>::T],
-                            index,
-                        ),)*
+                        #(W => {
+                            Self::unpack_single::<W>(
+                                array_ref![input, 0, 1024 * W / <$T>::T],
+                                index
+                            )
+                        })*
                         // seq_t has exclusive upper bound
                         Self::T => Self::unpack_single::<{ Self::T }>(
                             array_ref![input, 0, 1024],
-                            index,
+                            index
                         ),
                         _ => unreachable!("Unsupported width: {}", width)
                     }
@@ -169,12 +216,13 @@ impl_packing!(u64);
 
 #[cfg(test)]
 mod test {
-    use super::*;
     use core::array;
     use core::fmt::Debug;
     use core::mem::size_of;
     use seq_macro::seq;
 
+    use super::*;
+
     #[test]
     fn test_unchecked_pack() {
         let input = array::from_fn(|i| i as u32);
@@ -218,6 +266,14 @@ mod test {
 
         BitPacking::unpack::<W>(&packed, &mut unpacked);
         assert_eq!(&unpacked, &values);
+
+        for i in 0..1024 {
+            assert_eq!(BitPacking::unpack_single::<W>(&packed, i), values[i]);
+            assert_eq!(
+                unsafe { BitPacking::unchecked_unpack_single(W, &packed, i) },
+                values[i]
+            );
+        }
     }
 
     macro_rules! impl_try_round_trip {
diff --git a/src/lib.rs b/src/lib.rs
index 1e66cf2..4bf858d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,3 +45,16 @@ macro_rules! seq_t {
     ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
     ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
 }
+
+#[cfg(test)]
+mod test {
+    use crate::FL_ORDER;
+
+    #[test]
+    fn test_ordering_is_own_inverse() {
+        // Check that FL_ORDER "round-trips"; i.e., it is its own inverse permutation.
+        for i in 0..8 {
+            assert_eq!(FL_ORDER[FL_ORDER[i]], i);
+        }
+    }
+}
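
Note, not part of the patch: the sketch below is a standalone check of the row/lane inversion that `unpack_single` relies on, written for the u16 case. The constants are assumptions that mirror the crate (`FL_ORDER` as the FastLanes 04261537 permutation, `T = 16`, `LANES = 1024 / T`), and the local `index` helper simply restates the forward mapping quoted in the patch's comments; none of these names are crate APIs.

// Standalone sketch: verify that the lane/row recovery in `unpack_single` inverts the
// transposed `index` function for every position of the 1024-element block.
// Assumed constants for the u16 case: T = 16 bits, LANES = 1024 / T = 64.
const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7];
const T: usize = 16;
const LANES: usize = 1024 / T;

// Forward transposed index, as quoted in the patch's comments.
fn index(row: usize, lane: usize) -> usize {
    let o = row / 8;
    let s = row % 8;
    (FL_ORDER[o] * 16) + (s * 128) + lane
}

fn main() {
    for row in 0..T {
        for lane in 0..LANES {
            let idx = index(row, lane);
            // Inverse mapping, mirroring the logic added in `unpack_single`.
            // E.g. idx = 100: lane = 36, s = 0, fl_order = (100 - 36) / 16 = 4,
            // o = FL_ORDER[4] = 1, row = 1 * 8 + 0 = 8, and index(8, 36) == 100.
            let lane2 = idx % LANES;
            let s = idx / 128; // (FL_ORDER[o] * 16) + lane is always < 128
            let fl_order = (idx - s * 128 - lane2) / 16;
            let o = FL_ORDER[fl_order]; // FL_ORDER is its own inverse
            let row2 = o * 8 + s;
            assert_eq!((row2, lane2), (row, lane));
        }
    }
}

The `test_ordering_is_own_inverse` test added to src/lib.rs is what justifies the `let o = FL_ORDER[fl_order]` step: since the permutation is an involution, recovering `FL_ORDER[o]` and indexing back through `FL_ORDER` yields `o` again.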