diff --git a/Cargo.lock b/Cargo.lock
index 02d131950..37383b749 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4394,6 +4394,7 @@ dependencies = [
  "vortex-array",
  "vortex-dtype",
  "vortex-sampling-compressor",
+ "vortex-scalar",
 ]
 
 [[package]]
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 40257cedc..522c25107 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -13,6 +13,7 @@
 libfuzzer-sys = "0.4"
 vortex-array = { workspace = true, features = ["arbitrary"] }
 vortex-dtype = { workspace = true }
 vortex-sampling-compressor = { workspace = true }
+vortex-scalar = { workspace = true }
 
 [[bin]]
diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs
index bc6c11c8c..2bc10f09d 100644
--- a/fuzz/fuzz_targets/fuzz_target_1.rs
+++ b/fuzz/fuzz_targets/fuzz_target_1.rs
@@ -6,6 +6,7 @@ use libfuzzer_sys::arbitrary::{Arbitrary, Unstructured};
 use libfuzzer_sys::{fuzz_target, Corpus};
 use vortex::compute::slice;
 use vortex::compute::unary::scalar_at;
+use vortex::encoding::EncodingId;
 use vortex::Array;
 use vortex_sampling_compressor::compressors::alp::ALPCompressor;
 use vortex_sampling_compressor::compressors::bitpacked::BitPackedCompressor;
@@ -18,6 +19,7 @@ use vortex_sampling_compressor::compressors::sparse::SparseCompressor;
 use vortex_sampling_compressor::compressors::zigzag::ZigZagCompressor;
 use vortex_sampling_compressor::compressors::CompressorRef;
 use vortex_sampling_compressor::SamplingCompressor;
+use vortex_scalar::{PValue, Scalar, ScalarValue};
 
 fuzz_target!(|data: &[u8]| -> Corpus {
     let mut u = Unstructured::new(data);
@@ -92,8 +94,7 @@ fn assert_slice(original: &Array, slice: &Array, start: usize) {
         let o = scalar_at(original, start + idx).unwrap();
         let s = scalar_at(slice, idx).unwrap();
 
-        assert_eq!(o.value(), s.value());
-        assert_eq!(o.is_valid(), s.is_valid());
+        fuzzing_scalar_cmp(o, s, original.encoding().id(), slice.encoding().id(), idx);
     }
 }
 
@@ -103,13 +104,35 @@ fn assert_array_eq(lhs: &Array, rhs: &Array) {
         let l = scalar_at(lhs, idx).unwrap();
         let r = scalar_at(rhs, idx).unwrap();
 
-        assert_eq!(
-            l.value(),
-            r.value(),
-            "{l} != {r} at index {idx}, lhs is {} rhs is {}",
-            lhs.encoding().id(),
-            rhs.encoding().id()
-        );
-        assert_eq!(l.is_valid(), r.is_valid());
+        fuzzing_scalar_cmp(l, r, lhs.encoding().id(), rhs.encoding().id(), idx);
     }
 }
+
+fn fuzzing_scalar_cmp(
+    l: Scalar,
+    r: Scalar,
+    lhs_encoding: EncodingId,
+    rhs_encoding: EncodingId,
+    idx: usize,
+) {
+    let equal_values = match (l.value(), r.value()) {
+        (ScalarValue::Primitive(l), ScalarValue::Primitive(r))
+            if l.ptype().is_float() && r.ptype().is_float() =>
+        {
+            match (l, r) {
+                (PValue::F16(l), PValue::F16(r)) => l == r || (l.is_nan() && r.is_nan()),
+                (PValue::F32(l), PValue::F32(r)) => l == r || (l.is_nan() && r.is_nan()),
+                (PValue::F64(l), PValue::F64(r)) => l == r || (l.is_nan() && r.is_nan()),
+                _ => unreachable!(),
+            }
+        }
+        _ => l.value() == r.value(),
+    };
+
+    assert!(
+        equal_values,
+        "{l} != {r} at index {idx}, lhs is {} rhs is {}",
+        lhs_encoding, rhs_encoding
+    );
+    assert_eq!(l.is_valid(), r.is_valid());
+}
diff --git a/vortex-array/src/array/primitive/stats.rs b/vortex-array/src/array/primitive/stats.rs
index 1ff370d3e..9971bc0a7 100644
--- a/vortex-array/src/array/primitive/stats.rs
+++ b/vortex-array/src/array/primitive/stats.rs
@@ -131,6 +131,7 @@ struct StatsAccumulator<T: NativePType> {
     is_strict_sorted: bool,
     run_count: usize,
     null_count: usize,
+    nan_count: usize,
     bit_widths: Vec<usize>,
     trailing_zeros: Vec<usize>,
     len: usize,
@@ -149,6 +150,7 @@ impl<T: NativePType> StatsAccumulator<T> {
             bit_widths: vec![0; size_of::<T>() * 8 + 1],
             trailing_zeros: vec![0; size_of::<T>() * 8 + 1],
             len: 1,
+            nan_count: first_value.is_nan().then_some(1).unwrap_or_default(),
         };
         stats.bit_widths[first_value.bit_width() as usize] += 1;
         stats.trailing_zeros[first_value.trailing_zeros() as usize] += 1;
@@ -181,6 +183,10 @@
         self.trailing_zeros[next.trailing_zeros() as usize] += 1;
         self.len += 1;
 
+        if next.is_nan() {
+            self.nan_count += 1;
+        }
+
         if self.prev == next {
             self.is_strict_sorted = false;
         } else {
@@ -198,8 +204,9 @@
     }
 
     pub fn into_map(self) -> StatsSet {
-        let is_constant =
-            (self.min == self.max && self.null_count == 0) || self.null_count == self.len;
+        let is_constant = (self.min == self.max && self.null_count == 0 && self.nan_count == 0)
+            || self.null_count == self.len
+            || self.nan_count == self.len;
 
         StatsSet::from(HashMap::from([
             (Stat::Min, self.min.into()),
diff --git a/vortex-dtype/src/ptype.rs b/vortex-dtype/src/ptype.rs
index f91210a3d..28200112b 100644
--- a/vortex-dtype/src/ptype.rs
+++ b/vortex-dtype/src/ptype.rs
@@ -46,12 +46,30 @@ pub trait NativePType:
     + TryFromBytes
 {
     const PTYPE: PType;
+
+    fn is_nan(self) -> bool;
 }
 
 macro_rules! native_ptype {
     ($T:ty, $ptype:tt) => {
         impl NativePType for $T {
             const PTYPE: PType = PType::$ptype;
+
+            fn is_nan(self) -> bool {
+                false
+            }
+        }
+    };
+}
+
+macro_rules! native_float_ptype {
+    ($T:ty, $ptype:tt) => {
+        impl NativePType for $T {
+            const PTYPE: PType = PType::$ptype;
+
+            fn is_nan(self) -> bool {
+                <$T>::is_nan(self)
+            }
         }
     };
 }
@@ -64,9 +82,9 @@ native_ptype!(i8, I8);
 native_ptype!(i16, I16);
 native_ptype!(i32, I32);
 native_ptype!(i64, I64);
-native_ptype!(f16, F16);
-native_ptype!(f32, F32);
-native_ptype!(f64, F64);
+native_float_ptype!(f16, F16);
+native_float_ptype!(f32, F32);
+native_float_ptype!(f64, F64);
 
 #[macro_export]
 macro_rules! match_each_native_ptype {