Get beyond the immediate fuzzing failures (#611)

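Overcome some float-related issues (NaN-aware scalar comparison in the fuzz target and NaN-aware constant detection in primitive stats) to keep the fuzzer running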
spiraldb · Aug 13, 2024 · fdc025a
1 parent fc138b7
commit fdc025a
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 15 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -13,6 +13,7 @@ libfuzzer-sys = "0.4"
 vortex-array = { workspace = true, features = ["arbitrary"] }
 vortex-dtype = { workspace = true }
 vortex-sampling-compressor = { workspace = true }
+vortex-scalar = { workspace = true }
 
 
 [[bin]]

diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs
@@ -6,6 +6,7 @@ use libfuzzer_sys::arbitrary::{Arbitrary, Unstructured};
 use libfuzzer_sys::{fuzz_target, Corpus};
 use vortex::compute::slice;
 use vortex::compute::unary::scalar_at;
+use vortex::encoding::EncodingId;
 use vortex::Array;
 use vortex_sampling_compressor::compressors::alp::ALPCompressor;
 use vortex_sampling_compressor::compressors::bitpacked::BitPackedCompressor;
@@ -18,6 +19,7 @@ use vortex_sampling_compressor::compressors::sparse::SparseCompressor;
 use vortex_sampling_compressor::compressors::zigzag::ZigZagCompressor;
 use vortex_sampling_compressor::compressors::CompressorRef;
 use vortex_sampling_compressor::SamplingCompressor;
+use vortex_scalar::{PValue, Scalar, ScalarValue};
 
 fuzz_target!(|data: &[u8]| -> Corpus {
     let mut u = Unstructured::new(data);
@@ -92,8 +94,7 @@ fn assert_slice(original: &Array, slice: &Array, start: usize) {
         let o = scalar_at(original, start + idx).unwrap();
         let s = scalar_at(slice, idx).unwrap();
 
-        assert_eq!(o.value(), s.value());
-        assert_eq!(o.is_valid(), s.is_valid());
+        fuzzing_scalar_cmp(o, s, original.encoding().id(), slice.encoding().id(), idx);
     }
 }
 
@@ -103,13 +104,35 @@ fn assert_array_eq(lhs: &Array, rhs: &Array) {
         let l = scalar_at(lhs, idx).unwrap();
         let r = scalar_at(rhs, idx).unwrap();
 
-        assert_eq!(
-            l.value(),
-            r.value(),
-            "{l} != {r} at index {idx}, lhs is {} rhs is {}",
-            lhs.encoding().id(),
-            rhs.encoding().id()
-        );
-        assert_eq!(l.is_valid(), r.is_valid());
+        fuzzing_scalar_cmp(l, r, lhs.encoding().id(), rhs.encoding().id(), idx);
     }
 }
+
+/// Asserts that two scalars read from the same logical position are equal.
+///
+/// Float values are compared NaN-aware: two NaNs count as equal, because `==` on floats
+/// is never true for NaN and would otherwise flag data that round-tripped correctly.
+/// Every other value is compared with plain `==`. The validity of both scalars must
+/// match as well.
+///
+/// `lhs_encoding`, `rhs_encoding` and `idx` are only used to build the failure messages.
+///
+/// # Panics
+///
+/// Panics if the values or the validity of `l` and `r` differ.
+fn fuzzing_scalar_cmp(
+    l: Scalar,
+    r: Scalar,
+    lhs_encoding: EncodingId,
+    rhs_encoding: EncodingId,
+    idx: usize,
+) {
+    let equal_values = match (l.value(), r.value()) {
+        (ScalarValue::Primitive(l), ScalarValue::Primitive(r))
+            if l.ptype().is_float() && r.ptype().is_float() =>
+        {
+            match (l, r) {
+                (PValue::F16(l), PValue::F16(r)) => l == r || (l.is_nan() && r.is_nan()),
+                (PValue::F32(l), PValue::F32(r)) => l == r || (l.is_nan() && r.is_nan()),
+                (PValue::F64(l), PValue::F64(r)) => l == r || (l.is_nan() && r.is_nan()),
+                // The guard only proves both sides are *some* float type. Floats of
+                // different widths are a genuine mismatch between the two encodings, so
+                // report them through the assertion below (with index and encoding ids)
+                // instead of aborting with a context-free `unreachable!()`.
+                _ => false,
+            }
+        }
+        _ => l.value() == r.value(),
+    };
+
+    assert!(
+        equal_values,
+        "{l} != {r} at index {idx}, lhs is {} rhs is {}",
+        lhs_encoding, rhs_encoding
+    );
+    assert_eq!(
+        l.is_valid(),
+        r.is_valid(),
+        "validity mismatch at index {idx}, lhs is {} rhs is {}",
+        lhs_encoding,
+        rhs_encoding
+    );
+}
diff --git a/vortex-array/src/array/primitive/stats.rs b/vortex-array/src/array/primitive/stats.rs
@@ -131,6 +131,7 @@ struct StatsAccumulator<T: PStatsType> {
     is_strict_sorted: bool,
     run_count: usize,
     null_count: usize,
+    nan_count: usize,
     bit_widths: Vec<usize>,
     trailing_zeros: Vec<usize>,
     len: usize,
@@ -149,6 +150,7 @@ impl<T: PStatsType> StatsAccumulator<T> {
             bit_widths: vec![0; size_of::<T>() * 8 + 1],
             trailing_zeros: vec![0; size_of::<T>() * 8 + 1],
             len: 1,
+            nan_count: first_value.is_nan().then_some(1).unwrap_or_default(),
         };
         stats.bit_widths[first_value.bit_width() as usize] += 1;
         stats.trailing_zeros[first_value.trailing_zeros() as usize] += 1;
@@ -181,6 +183,10 @@ impl<T: PStatsType> StatsAccumulator<T> {
         self.trailing_zeros[next.trailing_zeros() as usize] += 1;
         self.len += 1;
 
+        if next.is_nan() {
+            self.nan_count += 1;
+        }
+
         if self.prev == next {
             self.is_strict_sorted = false;
         } else {
@@ -198,8 +204,9 @@ impl<T: PStatsType> StatsAccumulator<T> {
     }
 
     pub fn into_map(self) -> StatsSet {
-        let is_constant =
-            (self.min == self.max && self.null_count == 0) || self.null_count == self.len;
+        let is_constant = (self.min == self.max && self.null_count == 0 && self.nan_count == 0)
+            || self.null_count == self.len
+            || self.nan_count == self.len;
 
         StatsSet::from(HashMap::from([
             (Stat::Min, self.min.into()),

diff --git a/vortex-dtype/src/ptype.rs b/vortex-dtype/src/ptype.rs
@@ -46,12 +46,30 @@ pub trait NativePType:
     + TryFromBytes
 {
     const PTYPE: PType;
+
+    fn is_nan(self) -> bool;
 }
 
 macro_rules! native_ptype {
     ($T:ty, $ptype:tt) => {
         impl NativePType for $T {
             const PTYPE: PType = PType::$ptype;
+
+            fn is_nan(self) -> bool {
+                false
+            }
+        }
+    };
+}
+
+macro_rules! native_float_ptype {
+    ($T:ty, $ptype:tt) => {
+        impl NativePType for $T {
+            const PTYPE: PType = PType::$ptype;
+
+            fn is_nan(self) -> bool {
+                <$T>::is_nan(self)
+            }
         }
     };
 }
@@ -64,9 +82,9 @@ native_ptype!(i8, I8);
 native_ptype!(i16, I16);
 native_ptype!(i32, I32);
 native_ptype!(i64, I64);
-native_ptype!(f16, F16);
-native_ptype!(f32, F32);
-native_ptype!(f64, F64);
+native_float_ptype!(f16, F16);
+native_float_ptype!(f32, F32);
+native_float_ptype!(f64, F64);
 
 #[macro_export]
 macro_rules! match_each_native_ptype {