From 4ab530ea29d8c404d517ae5bcf5ea7b7f82852c5 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 5 May 2024 23:39:50 +0800 Subject: [PATCH 01/12] Add `GenericListViewArray` for `ListView` & `LargeListView` --- arrow-array/src/array/list_view_array.rs | 1003 ++++++++++++++++++++++ arrow-array/src/array/mod.rs | 12 + arrow-array/src/iterator.rs | 5 +- 3 files changed, 1018 insertions(+), 2 deletions(-) create mode 100644 arrow-array/src/array/list_view_array.rs diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs new file mode 100644 index 00000000000..b9a9fa2b580 --- /dev/null +++ b/arrow-array/src/array/list_view_array.rs @@ -0,0 +1,1003 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow_buffer::{NullBuffer, ScalarBuffer}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, FieldRef}; + +use crate::array::{make_array, print_long_array}; +use crate::iterator::GenericListViewArrayIter; +use crate::{ + new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, FixedSizeListArray, + OffsetSizeTrait, +}; + +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i32`. +/// +// See [`ListViewBuilder`](crate::builder::ListViewBuilder) for how to construct a [`ListViewArray`] +pub type ListViewArray = GenericListViewArray; + +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i64`. +/// +// See [`LargeListViewArray`](crate::builder::LargeListViewBuilder) for how to construct a [`LargeListViewArray`] +pub type LargeListViewArray = GenericListViewArray; + +/// +/// Different from [`crate::GenericListArray`] as it stores both an offset and length +/// meaning that take / filter operations can be implemented without copying the underlying data. +/// +/// [Variable-size List Layout: ListView Layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout +pub struct GenericListViewArray { + data_type: DataType, + nulls: Option, + values: ArrayRef, + value_offsets: ScalarBuffer, + value_sizes: ScalarBuffer, +} + +impl Clone for GenericListViewArray { + fn clone(&self) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.clone(), + values: self.values.clone(), + value_offsets: self.value_offsets.clone(), + value_sizes: self.value_sizes.clone(), + } + } +} + +impl GenericListViewArray { + /// The data type constructor of listview array. + /// The input is the schema of the child array and + /// the output is the [`DataType`], ListView or LargeListView. + pub const DATA_TYPE_CONSTRUCTOR: fn(FieldRef) -> DataType = if OffsetSize::IS_LARGE { + DataType::LargeListView + } else { + DataType::ListView + }; + + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Errors + /// + /// Errors if + /// + /// * `offsets.len() != sizes.len()` + /// * `offsets.len() != nulls.len()` + /// * `offsets.last() > values.len()` + /// * `!field.is_nullable() && values.is_nullable()` + /// * `field.data_type() != values.data_type()` + /// * `0 <= offsets[i] <= length of the child array` + /// * `0 <= offsets[i] + size[i] <= length of the child array` + pub fn try_new( + field: FieldRef, + offsets: ScalarBuffer, + sizes: ScalarBuffer, + values: ArrayRef, + nulls: Option, + ) -> Result { + let len = offsets.len(); + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}ListViewArray, expected {len} got {}", + OffsetSize::PREFIX, + n.len(), + ))); + } + } + if len != sizes.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", + OffsetSize::PREFIX, len, sizes.len() + ))); + } + + for i in 0..offsets.len() { + let length = values.len(); + let offset = offsets[i].as_usize(); + let size = sizes[i].as_usize(); + if offset > length { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid offset value for {}ListViewArray, offset: {}, length: {}", + OffsetSize::PREFIX, + offset, + length + ))); + } + if offset.saturating_add(size) > values.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}", + OffsetSize::PREFIX, offset, size, values.len() + ))); + } + } + + if !field.is_nullable() && values.is_nullable() { + return Err(ArrowError::InvalidArgumentError(format!( + "Non-nullable field of {}ListViewArray {:?} cannot contain nulls", + OffsetSize::PREFIX, + field.name() + ))); + } + + if field.data_type() != values.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "{}ListViewArray expected data type {} got {} for {:?}", + OffsetSize::PREFIX, + field.data_type(), + values.data_type(), + field.name() + ))); + } + + Ok(Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls, + values, + value_offsets: offsets, + value_sizes: sizes, + }) + } + + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new( + field: FieldRef, + offsets: ScalarBuffer, + sizes: ScalarBuffer, + values: ArrayRef, + nulls: Option, + ) -> Self { + Self::try_new(field, offsets, sizes, values, nulls).unwrap() + } + + /// Create a new [`GenericListViewArray`] of length `len` where all values are null + pub fn new_null(field: FieldRef, len: usize) -> Self { + let values = new_empty_array(field.data_type()); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls: Some(NullBuffer::new_null(len)), + value_offsets: ScalarBuffer::from(vec![]), + value_sizes: ScalarBuffer::from(vec![]), + values, + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts( + self, + ) -> ( + FieldRef, + ScalarBuffer, + ScalarBuffer, + ArrayRef, + Option, + ) { + let f = match self.data_type { + DataType::ListView(f) | DataType::LargeListView(f) => f, + _ => unreachable!(), + }; + ( + f, + self.value_offsets, + self.value_sizes, + self.values, + self.nulls, + ) + } + + /// Returns a reference to the offsets of this list + /// + /// Unlike [`Self::value_offsets`] this returns the [`ScalarBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn offsets(&self) -> &ScalarBuffer { + &self.value_offsets + } + + /// Returns a reference to the values of this list + #[inline] + pub fn values(&self) -> &ArrayRef { + &self.values + } + + /// Returns a reference to the sizes of this list + /// + /// Unlike [`Self::value_sizes`] this returns the [`ScalarBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn sizes(&self) -> &ScalarBuffer { + &self.value_sizes + } + + /// Returns a clone of the value type of this list. + pub fn value_type(&self) -> DataType { + self.values.data_type().clone() + } + + /// Returns ith value of this list view array. + /// # Safety + /// Caller must ensure that the index is within the array bounds + pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { + let offset = self.value_offsets()[i].as_usize(); + let length = self.value_sizes()[i].as_usize(); + self.values.slice(offset, length) + } + + /// Returns ith value of this list view array. + pub fn value(&self, i: usize) -> ArrayRef { + let offset = self.value_offsets()[i].as_usize(); + let length = self.value_sizes()[i].as_usize(); + self.values.slice(offset, length) + } + + /// Returns the offset values in the offsets buffer + #[inline] + pub fn value_offsets(&self) -> &[OffsetSize] { + &self.value_offsets + } + + /// Returns the sizes values in the offsets buffer + #[inline] + pub fn value_sizes(&self) -> &[OffsetSize] { + &self.value_sizes + } + + /// Returns the length for value at index `i`. + #[inline] + pub fn value_length(&self, i: usize) -> OffsetSize { + self.value_sizes[i] + } + + /// constructs a new iterator + pub fn iter<'a>(&'a self) -> GenericListViewArrayIter<'a, OffsetSize> { + GenericListViewArrayIter::<'a, OffsetSize>::new(self) + } + + #[inline] + fn get_type(data_type: &DataType) -> Option<&DataType> { + match (OffsetSize::IS_LARGE, data_type) { + (true, DataType::LargeListView(child)) | (false, DataType::ListView(child)) => { + Some(child.data_type()) + } + _ => None, + } + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + value_sizes: self.value_sizes.slice(offset, length), + } + } + + /// Creates a [`GenericListViewArray`] from an iterator of primitive values + /// + pub fn from_iter_primitive(_iter: I) -> Self + where + T: ArrowPrimitiveType, + P: IntoIterator::Native>>, + I: IntoIterator>, + { + panic!("Not yet implemented, wait list view array builder to be implemented") + } +} + +impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } +} + +impl Array for GenericListViewArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.clone().into() + } + + fn into_data(self) -> ArrayData { + self.into() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.sizes().len() + } + + fn is_empty(&self) -> bool { + self.values.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + size += self.value_offsets.inner().capacity(); + size += self.value_sizes.inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); + size += self.value_offsets.inner().capacity(); + size += self.value_sizes.inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } +} + +impl std::fmt::Debug for GenericListViewArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let prefix = OffsetSize::PREFIX; + write!(f, "{prefix}ListViewArray\n[\n")?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl From> for ArrayData { + fn from(array: GenericListViewArray) -> Self { + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![ + array.value_offsets.into_inner(), + array.value_sizes.into_inner(), + ]) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } + } +} + +impl From for GenericListViewArray { + fn from(data: ArrayData) -> Self { + Self::try_new_from_array_data(data) + .expect("Expected infallible creation of GenericListViewArray from ArrayDataRef failed") + } +} + +impl From for GenericListViewArray { + fn from(value: FixedSizeListArray) -> Self { + let (field, size) = match value.data_type() { + DataType::FixedSizeList(f, size) => (f, *size as usize), + _ => unreachable!(), + }; + let iter = std::iter::repeat(size).take(value.len()); + let mut offsets = Vec::with_capacity(iter.size_hint().0); + offsets.push(OffsetSize::usize_as(0)); + let mut acc = 0_usize; + let iter = std::iter::repeat(size).take(value.len()); + let mut sizes = Vec::with_capacity(iter.size_hint().0); + for size in iter { + acc = acc.checked_add(size).expect("usize overflow"); + offsets.push(OffsetSize::usize_as(acc)); + sizes.push(OffsetSize::usize_as(size)); + } + OffsetSize::from_usize(acc).expect("offset overflow"); + let sizes = ScalarBuffer::from(sizes); + let offsets = ScalarBuffer::from(offsets); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), + nulls: value.nulls().cloned(), + values: value.values().clone(), + value_offsets: offsets, + value_sizes: sizes, + } + } +} + +impl GenericListViewArray { + fn try_new_from_array_data(data: ArrayData) -> Result { + if data.buffers().len() != 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray data should contain two buffer (value offsets & value size), had {}", + data.buffers().len() + ))); + } + + if data.child_data().len() != 1 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray should contain a single child array (values array), had {}", + data.child_data().len() + ))); + } + + let values = data.child_data()[0].clone(); + + if let Some(child_data_type) = Self::get_type(data.data_type()) { + if values.data_type() != child_data_type { + return Err(ArrowError::InvalidArgumentError(format!( + "[Large]ListViewArray's child datatype {:?} does not \ + correspond to the List's datatype {:?}", + values.data_type(), + child_data_type + ))); + } + } else { + return Err(ArrowError::InvalidArgumentError(format!( + "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is {:?}", + data.data_type() + ))); + } + + let values = make_array(values); + // SAFETY: + // ArrayData is valid, and verified type above + let value_offsets = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let value_sizes = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); + + Ok(Self { + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + values, + value_offsets, + value_sizes, + }) + } +} + +#[cfg(test)] +mod tests { + use arrow_buffer::{bit_util, BooleanBuffer, Buffer, ScalarBuffer}; + use arrow_schema::Field; + + use crate::builder::{FixedSizeListBuilder, Int32Builder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Int32Array, Int64Array}; + + use super::*; + + #[test] + fn test_empty_list_view_array() { + // Construct an empty value array + let vec: Vec = vec![]; + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![]); + let offsets = ScalarBuffer::from(vec![]); + let values = Int32Array::from(vec); + let list_array = LargeListViewArray::new(field, offsets, sizes, Arc::new(values), None); + + assert_eq!(list_array.len(), 0) + } + + #[test] + fn test_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![3i32, 3, 2]); + let offsets = ScalarBuffer::from(vec![0i32, 3, 6]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let list_array = ListViewArray::new(field, offsets, sizes, Arc::new(values), None); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_length(2)); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_large_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![3i64, 3, 2]); + let offsets = ScalarBuffer::from(vec![0i64, 3, 6]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let list_array = LargeListViewArray::new(field, offsets, sizes, Arc::new(values), None); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_length(2)); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + let buffer = BooleanBuffer::new(Buffer::from(null_bits), 0, 9); + let null_buffer = NullBuffer::new(buffer); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![2, 0, 0, 2, 2, 0, 3, 0, 1]); + let offsets = ScalarBuffer::from(vec![0, 2, 2, 2, 4, 6, 6, 9, 9]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let list_array = + ListViewArray::new(field, offsets, sizes, Arc::new(values), Some(null_buffer)); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_length(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + assert_eq!(2, sliced_list_array.value_length(2)); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + assert_eq!(2, sliced_list_array.value_length(3)); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_sizes()[5]); + assert_eq!(3, sliced_list_array.value_length(5)); + } + + #[test] + fn test_large_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + let buffer = BooleanBuffer::new(Buffer::from(null_bits), 0, 9); + let null_buffer = NullBuffer::new(buffer); + + // Construct a large list view array from the above two + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![2i64, 0, 0, 2, 2, 0, 3, 0, 1]); + let offsets = ScalarBuffer::from(vec![0i64, 2, 2, 2, 4, 6, 6, 9, 9]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let list_array = + LargeListViewArray::new(field, offsets, sizes, Arc::new(values), Some(null_buffer)); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_length(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_length(2)); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_length(3)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_length(5)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + } + + #[test] + #[should_panic(expected = "index out of bounds: the len is 9 but the index is 10")] + fn test_list_view_array_index_out_of_bound() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + let buffer = BooleanBuffer::new(Buffer::from(null_bits), 0, 9); + let null_buffer = NullBuffer::new(buffer); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + // Construct a list array from the above two + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![2i32, 0, 0, 2, 2, 0, 3, 0, 1]); + let offsets = ScalarBuffer::from(vec![0i32, 2, 2, 2, 4, 6, 6, 9, 9]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let list_array = + ListViewArray::new(field, offsets, sizes, Arc::new(values), Some(null_buffer)); + + assert_eq!(9, list_array.len()); + list_array.value(10); + } + #[test] + #[should_panic( + expected = "ListViewArray data should contain two buffer (value offsets & value size), had 0" + )] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_buffer_len() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build_unchecked() + }; + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + #[should_panic( + expected = "ListViewArray data should contain two buffer (value offsets & value size), had 1" + )] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_child_array_len() { + let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + fn test_list_view_array_offsets_need_not_start_at_zero() { + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![0i32, 0, 3]); + let offsets = ScalarBuffer::from(vec![2i32, 2, 5]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let list_array = ListViewArray::new(field, offsets, sizes, Arc::new(values), None); + + assert_eq!(list_array.value_length(0), 0); + assert_eq!(list_array.value_length(1), 0); + assert_eq!(list_array.value_length(2), 3); + } + + #[test] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_alignment() { + let offset_buf = Buffer::from_slice_ref([0_u64]); + let offset_buf2 = offset_buf.slice(1); + + let size_buf = Buffer::from_slice_ref([0_u64]); + let size_buf2 = size_buf.slice(1); + + let values: [i32; 8] = [0; 8]; + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(Buffer::from_slice_ref(values)) + .build_unchecked() + }; + + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .add_buffer(offset_buf2) + .add_buffer(size_buf2) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + fn test_empty_offsets() { + let f = Arc::new(Field::new("element", DataType::Int32, true)); + let string = ListViewArray::from( + ArrayData::builder(DataType::ListView(f.clone())) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[]); + assert_eq!(string.value_sizes(), &[]); + + let string = LargeListViewArray::from( + ArrayData::builder(DataType::LargeListView(f)) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[]); + assert_eq!(string.value_sizes(), &[]); + } + + #[test] + fn test_try_new() { + let offsets = ScalarBuffer::from(vec![0, 1, 4, 5]); + let sizes = ScalarBuffer::from(vec![1, 3, 1, 0]); + let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None); + let values = Arc::new(values) as ArrayRef; + + let field = Arc::new(Field::new("element", DataType::Int32, false)); + ListViewArray::new( + field.clone(), + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ); + + let nulls = NullBuffer::new_null(4); + ListViewArray::new( + field.clone(), + offsets, + sizes.clone(), + values.clone(), + Some(nulls), + ); + + let nulls = NullBuffer::new_null(4); + let offsets = ScalarBuffer::from(vec![0, 1, 2, 3, 4]); + let sizes = ScalarBuffer::from(vec![1, 1, 1, 1, 0]); + let err = LargeListViewArray::try_new( + field, + offsets.clone(), + sizes.clone(), + values.clone(), + Some(nulls), + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Incorrect length of null buffer for LargeListViewArray, expected 5 got 4" + ); + + let field = Arc::new(Field::new("element", DataType::Int64, false)); + let err = LargeListViewArray::try_new( + field.clone(), + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: LargeListViewArray expected data type Int64 got Int32 for \"element\"" + ); + + let nulls = NullBuffer::new_null(7); + let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); + let values = Arc::new(values); + + let err = LargeListViewArray::try_new( + field, + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Non-nullable field of LargeListViewArray \"element\" cannot contain nulls" + ); + } + + #[test] + fn test_from_fixed_size_list() { + let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[0, 0, 0]); + builder.append(false); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + let list: ListViewArray = builder.finish().into(); + let values: Vec<_> = list + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]) + } + + #[test] + #[should_panic( + expected = "Not yet implemented, wait list view array builder to be implemented" + )] + fn test_from_iter_primitive() { + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + Some(vec![Some(6), Some(7)]), + ]; + let list_array = ListViewArray::from_iter_primitive::(data); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![3, 3, 2]); + let offsets = ScalarBuffer::from(vec![0, 3, 6]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let another = ListViewArray::new(field, offsets, sizes, Arc::new(values), None); + assert_eq!(list_array, another) + } +} diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index b115ff9c14c..d4d1f690f44 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -72,6 +72,10 @@ mod byte_view_array; pub use byte_view_array::*; +mod list_view_array; + +pub use list_view_array::*; + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`] so that it can be @@ -519,6 +523,12 @@ impl PartialEq for GenericListArray { } } +impl PartialEq for GenericListViewArray { + fn eq(&self, other: &Self) -> bool { + self.to_data().eq(&other.to_data()) + } +} + impl PartialEq for MapArray { fn eq(&self, other: &Self) -> bool { self.to_data().eq(&other.to_data()) @@ -607,6 +617,8 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Utf8View => Arc::new(StringViewArray::from(data)) as ArrayRef, DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef, DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef, + DataType::ListView(_) => Arc::new(ListViewArray::from(data)) as ArrayRef, + DataType::LargeListView(_) => Arc::new(LargeListViewArray::from(data)) as ArrayRef, DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef, DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef, diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 3f9cc0d525c..6708da3d5dd 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -21,7 +21,7 @@ use crate::array::{ ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, GenericStringArray, PrimitiveArray, }; -use crate::{FixedSizeListArray, MapArray}; +use crate::{FixedSizeListArray, GenericListViewArray, MapArray}; use arrow_buffer::NullBuffer; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] @@ -143,7 +143,8 @@ pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>; pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; /// an iterator that returns Some(T) or None, that can be used on any MapArray pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>; - +/// an iterator that returns Some(T) or None, that can be used on any ListArray +pub type GenericListViewArrayIter<'a, O> = ArrayIter<&'a GenericListViewArray>; #[cfg(test)] mod tests { use std::sync::Arc; From fbfa059107bae3a1df7a920667b82260e12f7fa4 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 5 May 2024 23:49:08 +0800 Subject: [PATCH 02/12] rebase master --- arrow-array/src/array/list_view_array.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index b9a9fa2b580..2e753575798 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -744,12 +744,6 @@ mod tests { #[test] #[should_panic(expected = "index out of bounds: the len is 9 but the index is 10")] fn test_list_view_array_index_out_of_bound() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build() - .unwrap(); // 01011001 00000001 let mut null_bits: [u8; 2] = [0; 2]; @@ -817,11 +811,6 @@ mod tests { #[test] fn test_list_view_array_offsets_need_not_start_at_zero() { - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) - .build() - .unwrap(); let field = Arc::new(Field::new("item", DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![0i32, 0, 3]); From 862106e5446dfb5ef38c486f3941859f18c9d1d3 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 19 May 2024 00:00:30 +0800 Subject: [PATCH 03/12] fix lint --- arrow-array/src/array/list_view_array.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 2e753575798..4e01ec5d33b 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -744,7 +744,6 @@ mod tests { #[test] #[should_panic(expected = "index out of bounds: the len is 9 but the index is 10")] fn test_list_view_array_index_out_of_bound() { - // 01011001 00000001 let mut null_bits: [u8; 2] = [0; 2]; bit_util::set_bit(&mut null_bits, 0); @@ -811,7 +810,6 @@ mod tests { #[test] fn test_list_view_array_offsets_need_not_start_at_zero() { - let field = Arc::new(Field::new("item", DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![0i32, 0, 3]); let offsets = ScalarBuffer::from(vec![2i32, 2, 5]); From 0913b306634b4f12ba83402a66eb3ed2793135d0 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 16 Jun 2024 14:25:50 +0800 Subject: [PATCH 04/12] remove rom_iter_primitive --- arrow-array/src/array/list_view_array.rs | 29 ------------------------ 1 file changed, 29 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 4e01ec5d33b..391e98d2ea4 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -297,16 +297,6 @@ impl GenericListViewArray { } } - /// Creates a [`GenericListViewArray`] from an iterator of primitive values - /// - pub fn from_iter_primitive(_iter: I) -> Self - where - T: ArrowPrimitiveType, - P: IntoIterator::Native>>, - I: IntoIterator>, - { - panic!("Not yet implemented, wait list view array builder to be implemented") - } } impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray { @@ -968,23 +958,4 @@ mod tests { assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]) } - #[test] - #[should_panic( - expected = "Not yet implemented, wait list view array builder to be implemented" - )] - fn test_from_iter_primitive() { - let data = vec![ - Some(vec![Some(0), Some(1), Some(2)]), - Some(vec![Some(3), Some(4), Some(5)]), - Some(vec![Some(6), Some(7)]), - ]; - let list_array = ListViewArray::from_iter_primitive::(data); - - let field = Arc::new(Field::new("item", DataType::Int32, true)); - let sizes = ScalarBuffer::from(vec![3, 3, 2]); - let offsets = ScalarBuffer::from(vec![0, 3, 6]); - let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); - let another = ListViewArray::new(field, offsets, sizes, Arc::new(values), None); - assert_eq!(list_array, another) - } } From 562dcfb5f0828218e20e7a1662193e979e217ed3 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 16 Jun 2024 15:14:10 +0800 Subject: [PATCH 05/12] fix fixed array --- arrow-array/src/array/list_view_array.rs | 34 +++++++++++------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 391e98d2ea4..70a5fad813a 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -25,7 +25,7 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; use crate::array::{make_array, print_long_array}; use crate::iterator::GenericListViewArrayIter; use crate::{ - new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, FixedSizeListArray, + new_empty_array, Array, ArrayAccessor, ArrayRef, FixedSizeListArray, OffsetSizeTrait, }; @@ -409,15 +409,13 @@ impl From for GenericListViewAr DataType::FixedSizeList(f, size) => (f, *size as usize), _ => unreachable!(), }; - let iter = std::iter::repeat(size).take(value.len()); - let mut offsets = Vec::with_capacity(iter.size_hint().0); - offsets.push(OffsetSize::usize_as(0)); + let mut offsets = Vec::with_capacity(size); let mut acc = 0_usize; let iter = std::iter::repeat(size).take(value.len()); let mut sizes = Vec::with_capacity(iter.size_hint().0); for size in iter { - acc = acc.checked_add(size).expect("usize overflow"); offsets.push(OffsetSize::usize_as(acc)); + acc = acc.checked_add(size).expect("usize overflow"); sizes.push(OffsetSize::usize_as(size)); } OffsetSize::from_usize(acc).expect("offset overflow"); @@ -535,17 +533,13 @@ mod tests { 0, list_array .value(0) - .as_any() - .downcast_ref::() - .unwrap() + .as_primitive::() .value(0) ); assert_eq!( 0, unsafe { list_array.value_unchecked(0) } - .as_any() - .downcast_ref::() - .unwrap() + .as_primitive::() .value(0) ); for i in 0..3 { @@ -581,17 +575,13 @@ mod tests { 0, list_array .value(0) - .as_any() - .downcast_ref::() - .unwrap() + .as_primitive::() .value(0) ); assert_eq!( 0, unsafe { list_array.value_unchecked(0) } - .as_any() - .downcast_ref::() - .unwrap() + .as_primitive::() .value(0) ); for i in 0..3 { @@ -955,7 +945,15 @@ mod tests { .iter() .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) .collect(); - assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]) + assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]); + let offsets = list.value_offsets(); + assert_eq!(offsets, &[0, 3, 6]); + let sizes = list.value_sizes(); + assert_eq!(sizes, &[3, 3, 3]); } + + + + } From 0a895b7d22bcaf9ba2bc48af86f293586fab025e Mon Sep 17 00:00:00 2001 From: kikkon Date: Sun, 16 Jun 2024 15:30:09 +0800 Subject: [PATCH 06/12] lint --- arrow-array/src/array/list_view_array.rs | 27 +++--------------------- 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 70a5fad813a..caff296a603 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -24,10 +24,7 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; use crate::array::{make_array, print_long_array}; use crate::iterator::GenericListViewArrayIter; -use crate::{ - new_empty_array, Array, ArrayAccessor, ArrayRef, FixedSizeListArray, - OffsetSizeTrait, -}; +use crate::{new_empty_array, Array, ArrayAccessor, ArrayRef, FixedSizeListArray, OffsetSizeTrait}; /// A [`GenericListViewArray`] of variable size lists, storing offsets as `i32`. /// @@ -296,7 +293,6 @@ impl GenericListViewArray { value_sizes: self.value_sizes.slice(offset, length), } } - } impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray { @@ -529,13 +525,7 @@ mod tests { assert_eq!(6, list_array.value_offsets()[2]); assert_eq!(2, list_array.value_sizes()[2]); assert_eq!(2, list_array.value_length(2)); - assert_eq!( - 0, - list_array - .value(0) - .as_primitive::() - .value(0) - ); + assert_eq!(0, list_array.value(0).as_primitive::().value(0)); assert_eq!( 0, unsafe { list_array.value_unchecked(0) } @@ -571,13 +561,7 @@ mod tests { assert_eq!(6, list_array.value_offsets()[2]); assert_eq!(2, list_array.value_sizes()[2]); assert_eq!(2, list_array.value_length(2)); - assert_eq!( - 0, - list_array - .value(0) - .as_primitive::() - .value(0) - ); + assert_eq!(0, list_array.value(0).as_primitive::().value(0)); assert_eq!( 0, unsafe { list_array.value_unchecked(0) } @@ -951,9 +935,4 @@ mod tests { let sizes = list.value_sizes(); assert_eq!(sizes, &[3, 3, 3]); } - - - - - } From 708ce90b451ca33185be8b77ad607a1208d131de Mon Sep 17 00:00:00 2001 From: Kikkon <19528375+Kikkon@users.noreply.github.com> Date: Wed, 31 Jul 2024 21:10:34 +0800 Subject: [PATCH 07/12] Update arrow-array/src/array/list_view_array.rs Co-authored-by: Weston Pace --- arrow-array/src/array/list_view_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index caff296a603..d115cfddfc0 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -33,7 +33,7 @@ pub type ListViewArray = GenericListViewArray; /// A [`GenericListViewArray`] of variable size lists, storing offsets as `i64`. /// -// See [`LargeListViewArray`](crate::builder::LargeListViewBuilder) for how to construct a [`LargeListViewArray`] +// See [`LargeListViewBuilder`](crate::builder::LargeListViewBuilder) for how to construct a [`LargeListViewArray`] pub type LargeListViewArray = GenericListViewArray; /// From 5dc9b589d01d415a09d356fe1194bc70faff4255 Mon Sep 17 00:00:00 2001 From: Kikkon <19528375+Kikkon@users.noreply.github.com> Date: Wed, 31 Jul 2024 21:10:49 +0800 Subject: [PATCH 08/12] Update arrow-array/src/array/list_view_array.rs Co-authored-by: Weston Pace --- arrow-array/src/array/list_view_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index d115cfddfc0..4c64d4c8a97 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -431,7 +431,7 @@ impl GenericListViewArray { fn try_new_from_array_data(data: ArrayData) -> Result { if data.buffers().len() != 2 { return Err(ArrowError::InvalidArgumentError(format!( - "ListViewArray data should contain two buffer (value offsets & value size), had {}", + "ListViewArray data should contain two buffers (value offsets & value sizes), had {}", data.buffers().len() ))); } From 097c0b4374edbdf5a1ba58afafa39e018f1a3e4f Mon Sep 17 00:00:00 2001 From: kikkon Date: Wed, 31 Jul 2024 23:47:47 +0800 Subject: [PATCH 09/12] fix comment --- arrow-array/src/array/list_view_array.rs | 179 ++++++++++++++++------- 1 file changed, 130 insertions(+), 49 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 4c64d4c8a97..266f0256549 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -17,7 +17,6 @@ use std::any::Any; use std::sync::Arc; - use arrow_buffer::{NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, FieldRef}; @@ -41,6 +40,7 @@ pub type LargeListViewArray = GenericListViewArray; /// meaning that take / filter operations can be implemented without copying the underlying data. /// /// [Variable-size List Layout: ListView Layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout +#[derive(Clone)] pub struct GenericListViewArray { data_type: DataType, nulls: Option, @@ -49,17 +49,6 @@ pub struct GenericListViewArray { value_sizes: ScalarBuffer, } -impl Clone for GenericListViewArray { - fn clone(&self) -> Self { - Self { - data_type: self.data_type.clone(), - nulls: self.nulls.clone(), - values: self.values.clone(), - value_offsets: self.value_offsets.clone(), - value_sizes: self.value_sizes.clone(), - } - } -} impl GenericListViewArray { /// The data type constructor of listview array. @@ -79,7 +68,7 @@ impl GenericListViewArray { /// /// * `offsets.len() != sizes.len()` /// * `offsets.len() != nulls.len()` - /// * `offsets.last() > values.len()` + /// * `offsets[i] > values.len()` /// * `!field.is_nullable() && values.is_nullable()` /// * `field.data_type() != values.data_type()` /// * `0 <= offsets[i] <= length of the child array` @@ -108,18 +97,9 @@ impl GenericListViewArray { ))); } - for i in 0..offsets.len() { - let length = values.len(); - let offset = offsets[i].as_usize(); - let size = sizes[i].as_usize(); - if offset > length { - return Err(ArrowError::InvalidArgumentError(format!( - "Invalid offset value for {}ListViewArray, offset: {}, length: {}", - OffsetSize::PREFIX, - offset, - length - ))); - } + for (offset, size) in offsets.iter().zip(sizes.iter()) { + let offset = offset.as_usize(); + let size = size.as_usize(); if offset.saturating_add(size) > values.len() { return Err(ArrowError::InvalidArgumentError(format!( "Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}", @@ -238,8 +218,8 @@ impl GenericListViewArray { /// # Safety /// Caller must ensure that the index is within the array bounds pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { - let offset = self.value_offsets()[i].as_usize(); - let length = self.value_sizes()[i].as_usize(); + let offset = self.value_offsets().get_unchecked(i).as_usize(); + let length = self.value_sizes().get_unchecked(i).as_usize(); self.values.slice(offset, length) } @@ -262,12 +242,17 @@ impl GenericListViewArray { &self.value_sizes } - /// Returns the length for value at index `i`. + /// Returns the size for value at index `i`. #[inline] - pub fn value_length(&self, i: usize) -> OffsetSize { + pub fn value_size(&self, i: usize) -> OffsetSize { self.value_sizes[i] } + /// Returns the offset for value as index `i`. + pub fn value_offset(&self, i: usize) -> OffsetSize { + self.value_offsets[i] + } + /// constructs a new iterator pub fn iter<'a>(&'a self) -> GenericListViewArrayIter<'a, OffsetSize> { GenericListViewArrayIter::<'a, OffsetSize>::new(self) @@ -303,7 +288,7 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray } unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - GenericListViewArray::value(self, index) + GenericListViewArray::value_unchecked(self, index) } } @@ -333,7 +318,7 @@ impl Array for GenericListViewArray { } fn is_empty(&self) -> bool { - self.values.is_empty() + self.value_sizes.is_empty() } fn offset(&self) -> usize { @@ -405,16 +390,15 @@ impl From for GenericListViewAr DataType::FixedSizeList(f, size) => (f, *size as usize), _ => unreachable!(), }; - let mut offsets = Vec::with_capacity(size); let mut acc = 0_usize; let iter = std::iter::repeat(size).take(value.len()); let mut sizes = Vec::with_capacity(iter.size_hint().0); + let mut offsets = Vec::with_capacity(iter.size_hint().0); for size in iter { offsets.push(OffsetSize::usize_as(acc)); - acc = acc.checked_add(size).expect("usize overflow"); + acc = acc.saturating_add(size); sizes.push(OffsetSize::usize_as(size)); } - OffsetSize::from_usize(acc).expect("offset overflow"); let sizes = ScalarBuffer::from(sizes); let offsets = ScalarBuffer::from(offsets); Self { @@ -462,7 +446,6 @@ impl GenericListViewArray { } let values = make_array(values); - // SAFETY: // ArrayData is valid, and verified type above let value_offsets = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); let value_sizes = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); @@ -524,7 +507,7 @@ mod tests { assert_eq!(0, list_array.null_count()); assert_eq!(6, list_array.value_offsets()[2]); assert_eq!(2, list_array.value_sizes()[2]); - assert_eq!(2, list_array.value_length(2)); + assert_eq!(2, list_array.value_size(2)); assert_eq!(0, list_array.value(0).as_primitive::().value(0)); assert_eq!( 0, @@ -560,7 +543,7 @@ mod tests { assert_eq!(0, list_array.null_count()); assert_eq!(6, list_array.value_offsets()[2]); assert_eq!(2, list_array.value_sizes()[2]); - assert_eq!(2, list_array.value_length(2)); + assert_eq!(2, list_array.value_size(2)); assert_eq!(0, list_array.value(0).as_primitive::().value(0)); assert_eq!( 0, @@ -607,7 +590,7 @@ mod tests { assert_eq!(4, list_array.null_count()); assert_eq!(2, list_array.value_offsets()[3]); assert_eq!(2, list_array.value_sizes()[3]); - assert_eq!(2, list_array.value_length(3)); + assert_eq!(2, list_array.value_size(3)); let sliced_array = list_array.slice(1, 6); assert_eq!(6, sliced_array.len()); @@ -628,15 +611,15 @@ mod tests { .unwrap(); assert_eq!(2, sliced_list_array.value_offsets()[2]); assert_eq!(2, sliced_list_array.value_sizes()[2]); - assert_eq!(2, sliced_list_array.value_length(2)); + assert_eq!(2, sliced_list_array.value_size(2)); assert_eq!(4, sliced_list_array.value_offsets()[3]); assert_eq!(2, sliced_list_array.value_sizes()[3]); - assert_eq!(2, sliced_list_array.value_length(3)); + assert_eq!(2, sliced_list_array.value_size(3)); assert_eq!(6, sliced_list_array.value_offsets()[5]); assert_eq!(3, sliced_list_array.value_sizes()[5]); - assert_eq!(3, sliced_list_array.value_length(5)); + assert_eq!(3, sliced_list_array.value_size(5)); } #[test] @@ -673,7 +656,7 @@ mod tests { assert_eq!(4, list_array.null_count()); assert_eq!(2, list_array.value_offsets()[3]); assert_eq!(2, list_array.value_sizes()[3]); - assert_eq!(2, list_array.value_length(3)); + assert_eq!(2, list_array.value_size(3)); let sliced_array = list_array.slice(1, 6); assert_eq!(6, sliced_array.len()); @@ -693,15 +676,15 @@ mod tests { .downcast_ref::() .unwrap(); assert_eq!(2, sliced_list_array.value_offsets()[2]); - assert_eq!(2, sliced_list_array.value_length(2)); + assert_eq!(2, sliced_list_array.value_size(2)); assert_eq!(2, sliced_list_array.value_sizes()[2]); assert_eq!(4, sliced_list_array.value_offsets()[3]); - assert_eq!(2, sliced_list_array.value_length(3)); + assert_eq!(2, sliced_list_array.value_size(3)); assert_eq!(2, sliced_list_array.value_sizes()[3]); assert_eq!(6, sliced_list_array.value_offsets()[5]); - assert_eq!(3, sliced_list_array.value_length(5)); + assert_eq!(3, sliced_list_array.value_size(5)); assert_eq!(2, sliced_list_array.value_sizes()[3]); } @@ -733,7 +716,7 @@ mod tests { } #[test] #[should_panic( - expected = "ListViewArray data should contain two buffer (value offsets & value size), had 0" + expected = "ListViewArray data should contain two buffers (value offsets & value sizes), had 0" )] #[cfg(not(feature = "force_validate"))] fn test_list_view_array_invalid_buffer_len() { @@ -780,9 +763,9 @@ mod tests { let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); let list_array = ListViewArray::new(field, offsets, sizes, Arc::new(values), None); - assert_eq!(list_array.value_length(0), 0); - assert_eq!(list_array.value_length(1), 0); - assert_eq!(list_array.value_length(2), 3); + assert_eq!(list_array.value_size(0), 0); + assert_eq!(list_array.value_size(1), 0); + assert_eq!(list_array.value_size(2), 3); } #[test] @@ -935,4 +918,102 @@ mod tests { let sizes = list.value_sizes(); assert_eq!(sizes, &[3, 3, 3]); } + + #[test] + fn test_list_view_array_overlap_lists() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build_unchecked() + }; + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(2) + .add_buffer(Buffer::from_slice_ref([0, 3])) // offsets + .add_buffer(Buffer::from_slice_ref([5, 5])) // sizes + .add_child_data(value_data) + .build_unchecked() + }; + let array = ListViewArray::from(list_data); + + assert_eq!(array.len(), 2); + assert_eq!(array.value_size(0), 5); + assert_eq!(array.value_size(1), 5); + + let values: Vec<_> = array + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![0,1, 2, 3,4]), Some(vec![3, 4, 5, 6,7])]); + + } + + #[test] + fn test_list_view_array_incomplete_offsets() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(50) + .add_buffer(Buffer::from_slice_ref((0..50).collect::>())) + .build_unchecked() + }; + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(Buffer::from_slice_ref([0, 5, 10])) // offsets + .add_buffer(Buffer::from_slice_ref([0, 5, 10])) // sizes + .add_child_data(value_data) + .build_unchecked() + }; + let array = ListViewArray::from(list_data); + + assert_eq!(array.len(), 3); + assert_eq!(array.value_size(0), 0); + assert_eq!(array.value_size(1), 5); + assert_eq!(array.value_size(2), 10); + + let values: Vec<_> = array + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![ + Some(vec![]), + Some(vec![5, 6, 7, 8, 9]), + Some(vec![10, 11, 12, 13, 14, 15, 16, 17, 18, 19]) + ]); + } + + #[test] + fn test_list_view_array_empty_lists() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(0) + .add_buffer(Buffer::from_slice_ref::(&[])) + .build_unchecked() + }; + let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(Buffer::from_slice_ref([0, 0, 0])) // offsets + .add_buffer(Buffer::from_slice_ref([0, 0, 0])) // sizes + .add_child_data(value_data) + .build_unchecked() + }; + let array = ListViewArray::from(list_data); + + assert_eq!(array.len(), 3); + assert_eq!(array.value_size(0), 0); + assert_eq!(array.value_size(1), 0); + assert_eq!(array.value_size(2), 0); + + let values: Vec<_> = array + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![]), Some(vec![]), Some(vec![])]); + } + } From 33a896f10ec6cb1f887ffed16cb7dccf5eab6060 Mon Sep 17 00:00:00 2001 From: kikkon Date: Thu, 1 Aug 2024 00:05:51 +0800 Subject: [PATCH 10/12] cargo fmt --- arrow-array/src/array/list_view_array.rs | 36 ++++++++++++++---------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 266f0256549..052a4f1f6f9 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; -use std::sync::Arc; use arrow_buffer::{NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, FieldRef}; +use std::any::Any; +use std::sync::Arc; use crate::array::{make_array, print_long_array}; use crate::iterator::GenericListViewArrayIter; @@ -49,7 +49,6 @@ pub struct GenericListViewArray { value_sizes: ScalarBuffer, } - impl GenericListViewArray { /// The data type constructor of listview array. /// The input is the schema of the child array and @@ -739,7 +738,7 @@ mod tests { #[test] #[should_panic( - expected = "ListViewArray data should contain two buffer (value offsets & value size), had 1" + expected = "ListViewArray data should contain two buffers (value offsets & value sizes), had 1" )] #[cfg(not(feature = "force_validate"))] fn test_list_view_array_invalid_child_array_len() { @@ -927,7 +926,8 @@ mod tests { .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build_unchecked() }; - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(2) @@ -946,8 +946,10 @@ mod tests { .iter() .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) .collect(); - assert_eq!(values, vec![Some(vec![0,1, 2, 3,4]), Some(vec![3, 4, 5, 6,7])]); - + assert_eq!( + values, + vec![Some(vec![0, 1, 2, 3, 4]), Some(vec![3, 4, 5, 6, 7])] + ); } #[test] @@ -958,7 +960,8 @@ mod tests { .add_buffer(Buffer::from_slice_ref((0..50).collect::>())) .build_unchecked() }; - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -978,11 +981,14 @@ mod tests { .iter() .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) .collect(); - assert_eq!(values, vec![ - Some(vec![]), - Some(vec![5, 6, 7, 8, 9]), - Some(vec![10, 11, 12, 13, 14, 15, 16, 17, 18, 19]) - ]); + assert_eq!( + values, + vec![ + Some(vec![]), + Some(vec![5, 6, 7, 8, 9]), + Some(vec![10, 11, 12, 13, 14, 15, 16, 17, 18, 19]) + ] + ); } #[test] @@ -993,7 +999,8 @@ mod tests { .add_buffer(Buffer::from_slice_ref::(&[])) .build_unchecked() }; - let list_data_type = DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -1015,5 +1022,4 @@ mod tests { .collect(); assert_eq!(values, vec![Some(vec![]), Some(vec![]), Some(vec![])]); } - } From 98fd023b642353430a0ba8f103530aed915d9970 Mon Sep 17 00:00:00 2001 From: kikkon Date: Sat, 21 Sep 2024 20:24:34 +0800 Subject: [PATCH 11/12] fix: check size --- arrow-array/src/array/list_view_array.rs | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 052a4f1f6f9..d9e6cb22e60 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -19,6 +19,7 @@ use arrow_buffer::{NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, FieldRef}; use std::any::Any; +use std::ops::Add; use std::sync::Arc; use crate::array::{make_array, print_long_array}; @@ -99,10 +100,19 @@ impl GenericListViewArray { for (offset, size) in offsets.iter().zip(sizes.iter()) { let offset = offset.as_usize(); let size = size.as_usize(); - if offset.saturating_add(size) > values.len() { + if offset.checked_add(size).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Overflow in offset + size for {}ListViewArray", + OffsetSize::PREFIX + )) + })? > values.len() + { return Err(ArrowError::InvalidArgumentError(format!( - "Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}", - OffsetSize::PREFIX, offset, size, values.len() + "Offset + size for {}ListViewArray must be within the bounds of the child array, got offset: {}, size: {}, child array length: {}", + OffsetSize::PREFIX, + offset, + size, + values.len() ))); } } @@ -393,9 +403,11 @@ impl From for GenericListViewAr let iter = std::iter::repeat(size).take(value.len()); let mut sizes = Vec::with_capacity(iter.size_hint().0); let mut offsets = Vec::with_capacity(iter.size_hint().0); + //check if the size is usize + for size in iter { offsets.push(OffsetSize::usize_as(acc)); - acc = acc.saturating_add(size); + acc = acc.add(size); sizes.push(OffsetSize::usize_as(size)); } let sizes = ScalarBuffer::from(sizes); From 65a646e9c4beff6dcd98566e8826ea110b30d80f Mon Sep 17 00:00:00 2001 From: Kikkon <19528375+Kikkon@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:42:23 +0800 Subject: [PATCH 12/12] chore:remove unuse comment --- arrow-array/src/array/list_view_array.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index d9e6cb22e60..68802528a7e 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -403,7 +403,6 @@ impl From for GenericListViewAr let iter = std::iter::repeat(size).take(value.len()); let mut sizes = Vec::with_capacity(iter.size_hint().0); let mut offsets = Vec::with_capacity(iter.size_hint().0); - //check if the size is usize for size in iter { offsets.push(OffsetSize::usize_as(acc));