Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support of fixedsizebinary, duration, interval support in arrow #374

Merged
merged 4 commits into from
Aug 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 119 additions & 11 deletions crates/duckdb/src/vtab/arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@ use super::{BindInfo, DataChunkHandle, Free, FunctionInfo, InitInfo, LogicalType
use std::ptr::null_mut;

use crate::core::{ArrayVector, FlatVector, Inserter, ListVector, StructVector, Vector};
use arrow::array::{
as_boolean_array, as_generic_binary_array, as_large_list_array, as_list_array, as_primitive_array, as_string_array,
as_struct_array, Array, ArrayData, AsArray, BinaryArray, BooleanArray, Decimal128Array, FixedSizeListArray,
GenericListArray, GenericStringArray, LargeStringArray, OffsetSizeTrait, PrimitiveArray, StructArray,
use arrow::{
array::{
as_boolean_array, as_generic_binary_array, as_large_list_array, as_list_array, as_primitive_array,
as_string_array, as_struct_array, Array, ArrayData, AsArray, BinaryArray, BooleanArray, Decimal128Array,
FixedSizeBinaryArray, FixedSizeListArray, GenericListArray, GenericStringArray, IntervalMonthDayNanoArray,
LargeBinaryArray, LargeStringArray, OffsetSizeTrait, PrimitiveArray, StructArray,
},
compute::cast,
};

use arrow::{
Expand Down Expand Up @@ -194,9 +198,12 @@ pub fn to_duckdb_logical_type(data_type: &DataType) -> Result<LogicalTypeHandle,
// DuckDB does not support negative decimal scales
Ok(LogicalTypeHandle::decimal(*width, (*scale).try_into().unwrap()))
}
DataType::Boolean | DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
Ok(LogicalTypeHandle::from(to_duckdb_type_id(data_type)?))
}
DataType::Boolean
| DataType::Utf8
| DataType::LargeUtf8
| DataType::Binary
| DataType::LargeBinary
| DataType::FixedSizeBinary(_) => Ok(LogicalTypeHandle::from(to_duckdb_type_id(data_type)?)),
dtype if dtype.is_primitive() => Ok(LogicalTypeHandle::from(to_duckdb_type_id(data_type)?)),
_ => Err(format!(
"Unsupported data type: {data_type}, please file an issue https://github.com/wangfenjin/duckdb-rs"
Expand Down Expand Up @@ -238,6 +245,18 @@ pub fn record_batch_to_duckdb_data_chunk(
DataType::Binary => {
binary_array_to_vector(as_generic_binary_array(col.as_ref()), &mut chunk.flat_vector(i));
}
DataType::FixedSizeBinary(_) => {
fixed_size_binary_array_to_vector(col.as_ref().as_fixed_size_binary(), &mut chunk.flat_vector(i));
}
DataType::LargeBinary => {
large_binary_array_to_vector(
col.as_ref()
.as_any()
.downcast_ref::<LargeBinaryArray>()
.ok_or_else(|| Box::<dyn std::error::Error>::from("Unable to downcast to LargeBinaryArray"))?,
&mut chunk.flat_vector(i),
);
}
DataType::List(_) => {
list_array_to_vector(as_list_array(col.as_ref()), &mut chunk.list_vector(i))?;
}
Expand Down Expand Up @@ -282,7 +301,7 @@ fn primitive_array_to_flat_vector_cast<T: ArrowPrimitiveType>(
array: &dyn Array,
out_vector: &mut dyn Vector,
) {
let array = arrow::compute::kernels::cast::cast(array, &data_type).unwrap();
let array = cast(array, &data_type).unwrap_or_else(|_| panic!("array is casted into {data_type}"));
let out_vector: &mut FlatVector = out_vector.as_mut_any().downcast_mut().unwrap();
out_vector.copy::<T::Native>(array.as_primitive::<T>().values());
if let Some(nulls) = array.nulls() {
Expand Down Expand Up @@ -366,7 +385,21 @@ fn primitive_array_to_vector(array: &dyn Array, out: &mut dyn Vector) -> Result<
*width,
);
}

DataType::Interval(_) | DataType::Duration(_) => {
let array = IntervalMonthDayNanoArray::from(
cast(array, &DataType::Interval(IntervalUnit::MonthDayNano))
.expect("array is casted into IntervalMonthDayNanoArray")
.as_primitive::<IntervalMonthDayNanoType>()
.values()
.iter()
.map(|a| IntervalMonthDayNanoType::make_value(a.months, a.days, a.nanoseconds / 1000))
.collect::<Vec<_>>(),
);
primitive_array_to_flat_vector::<IntervalMonthDayNanoType>(
as_primitive_array(&array),
out.as_mut_any().downcast_mut().unwrap(),
);
}
// DuckDB Only supports timetamp_tz in microsecond precision
DataType::Timestamp(_, Some(tz)) => primitive_array_to_flat_vector_cast::<TimestampMicrosecondType>(
DataType::Timestamp(TimeUnit::Microsecond, Some(tz.clone())),
Expand Down Expand Up @@ -478,6 +511,28 @@ fn binary_array_to_vector(array: &BinaryArray, out: &mut FlatVector) {
}
}

fn fixed_size_binary_array_to_vector(array: &FixedSizeBinaryArray, out: &mut FlatVector) {
assert!(array.len() <= out.capacity());

for i in 0..array.len() {
let s = array.value(i);
out.insert(i, s);
}
// Put this back once the other PR #
// set_nulls_in_flat_vector(array, out);
}

fn large_binary_array_to_vector(array: &LargeBinaryArray, out: &mut FlatVector) {
assert!(array.len() <= out.capacity());

for i in 0..array.len() {
let s = array.value(i);
out.insert(i, s);
}
// Put this back once the other PR #
// set_nulls_in_flat_vector(array, out);
}

fn list_array_to_vector<O: OffsetSizeTrait + AsPrimitive<usize>>(
array: &GenericListArray<O>,
out: &mut ListVector,
Expand Down Expand Up @@ -618,12 +673,16 @@ mod test {
use arrow::{
array::{
Array, ArrayRef, AsArray, BinaryArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
FixedSizeListArray, GenericByteArray, GenericListArray, Int32Array, LargeStringArray, ListArray,
DurationSecondArray, FixedSizeListArray, GenericByteArray, GenericListArray, Int32Array,
IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeStringArray, ListArray,
OffsetSizeTrait, PrimitiveArray, StringArray, StructArray, Time32SecondArray, Time64MicrosecondArray,
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray,
},
buffer::{OffsetBuffer, ScalarBuffer},
datatypes::{i256, ArrowPrimitiveType, ByteArrayType, DataType, Field, Fields, Schema},
datatypes::{
i256, ArrowPrimitiveType, ByteArrayType, DataType, DurationSecondType, Field, Fields, IntervalDayTimeType,
IntervalMonthDayNanoType, IntervalYearMonthType, Schema,
},
record_batch::RecordBatch,
};
use std::{error::Error, sync::Arc};
Expand Down Expand Up @@ -1002,6 +1061,55 @@ mod test {
Ok(())
}

#[test]
fn test_interval_roundtrip() -> Result<(), Box<dyn Error>> {
let array: PrimitiveArray<IntervalMonthDayNanoType> = IntervalMonthDayNanoArray::from(vec![
IntervalMonthDayNanoType::make_value(1, 1, 1000),
IntervalMonthDayNanoType::make_value(2, 2, 2000),
IntervalMonthDayNanoType::make_value(3, 3, 3000),
]);
check_rust_primitive_array_roundtrip(array.clone(), array)?;

let array: PrimitiveArray<IntervalYearMonthType> = IntervalYearMonthArray::from(vec![
IntervalYearMonthType::make_value(1, 10),
IntervalYearMonthType::make_value(2, 20),
IntervalYearMonthType::make_value(3, 30),
]);
let expected_array: PrimitiveArray<IntervalMonthDayNanoType> = IntervalMonthDayNanoArray::from(vec![
IntervalMonthDayNanoType::make_value(22, 0, 0),
IntervalMonthDayNanoType::make_value(44, 0, 0),
IntervalMonthDayNanoType::make_value(66, 0, 0),
]);
check_rust_primitive_array_roundtrip(array, expected_array)?;

let array: PrimitiveArray<IntervalDayTimeType> = IntervalDayTimeArray::from(vec![
IntervalDayTimeType::make_value(1, 1),
IntervalDayTimeType::make_value(2, 2),
IntervalDayTimeType::make_value(3, 3),
]);
let expected_array: PrimitiveArray<IntervalMonthDayNanoType> = IntervalMonthDayNanoArray::from(vec![
IntervalMonthDayNanoType::make_value(0, 1, 1_000_000),
IntervalMonthDayNanoType::make_value(0, 2, 2_000_000),
IntervalMonthDayNanoType::make_value(0, 3, 3_000_000),
]);
check_rust_primitive_array_roundtrip(array, expected_array)?;

Ok(())
}

#[test]
fn test_duration_roundtrip() -> Result<(), Box<dyn Error>> {
let array: PrimitiveArray<DurationSecondType> = DurationSecondArray::from(vec![1, 2, 3]);
let expected_array: PrimitiveArray<IntervalMonthDayNanoType> = IntervalMonthDayNanoArray::from(vec![
IntervalMonthDayNanoType::make_value(0, 0, 1_000_000_000),
IntervalMonthDayNanoType::make_value(0, 0, 2_000_000_000),
IntervalMonthDayNanoType::make_value(0, 0, 3_000_000_000),
]);
check_rust_primitive_array_roundtrip(array, expected_array)?;

Ok(())
}

#[test]
fn test_timestamp_tz_insert() -> Result<(), Box<dyn Error>> {
// TODO: This test should be reworked once we support TIMESTAMP_TZ properly
Expand Down
Loading