diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index eeec75f2f473d..238afb0328672 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" #include "arrow/util/small_vector.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" @@ -260,7 +262,7 @@ struct SchemaExporter { // Dictionary type: parent struct describes index type, // child dictionary struct describes value type. RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), this)); - dict_exporter_.reset(new SchemaExporter()); + dict_exporter_ = std::make_unique(); RETURN_NOT_OK(dict_exporter_->ExportType(*dict_type.value_type())); } else { RETURN_NOT_OK(VisitTypeInline(type, this)); @@ -357,10 +359,14 @@ struct SchemaExporter { Status Visit(const LargeBinaryType& type) { return SetFormat("Z"); } + Status Visit(const BinaryViewType& type) { return SetFormat("vz"); } + Status Visit(const StringType& type) { return SetFormat("u"); } Status Visit(const LargeStringType& type) { return SetFormat("U"); } + Status Visit(const StringViewType& type) { return SetFormat("vu"); } + Status Visit(const Date32Type& type) { return SetFormat("tdD"); } Status Visit(const Date64Type& type) { return SetFormat("tdm"); } @@ -521,13 +527,14 @@ namespace { struct ExportedArrayPrivateData : PoolAllocationMixin { // The buffers are owned by the ArrayData member - StaticVector buffers_; + SmallVector buffers_; struct ArrowArray dictionary_; SmallVector children_; SmallVector child_pointers_; std::shared_ptr data_; std::shared_ptr sync_; + std::vector variadic_buffer_sizes_; ExportedArrayPrivateData() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData); @@ -570,15 +577,32 @@ struct ArrayExporter { --n_buffers; ++buffers_begin; } + + bool need_variadic_buffer_sizes = + data->type->id() == Type::BINARY_VIEW || data->type->id() == Type::STRING_VIEW; + if (need_variadic_buffer_sizes) { + ++n_buffers; + } + export_.buffers_.resize(n_buffers); std::transform(buffers_begin, data->buffers.end(), export_.buffers_.begin(), [](const std::shared_ptr& buffer) -> const void* { return buffer ? buffer->data() : nullptr; }); + if (need_variadic_buffer_sizes) { + auto variadic_buffers = util::span(data->buffers).subspan(2); + export_.variadic_buffer_sizes_.resize(variadic_buffers.size()); + size_t i = 0; + for (const auto& buf : variadic_buffers) { + export_.variadic_buffer_sizes_[i++] = buf->size(); + } + export_.buffers_.back() = export_.variadic_buffer_sizes_.data(); + } + // Export dictionary if (data->dictionary != nullptr) { - dict_exporter_.reset(new ArrayExporter()); + dict_exporter_ = std::make_unique(); RETURN_NOT_OK(dict_exporter_->Export(data->dictionary)); } @@ -795,7 +819,7 @@ Status InvalidFormatString(std::string_view v) { class FormatStringParser { public: - FormatStringParser() {} + FormatStringParser() = default; explicit FormatStringParser(std::string_view v) : view_(v), index_(0) {} @@ -941,8 +965,6 @@ Result DecodeMetadata(const char* metadata) { } struct SchemaImporter { - SchemaImporter() : c_struct_(nullptr), guard_(nullptr) {} - Status Import(struct ArrowSchema* src) { if (ArrowSchemaIsReleased(src)) { return Status::Invalid("Cannot import released ArrowSchema"); @@ -1068,6 +1090,8 @@ struct SchemaImporter { return ProcessPrimitive(binary()); case 'Z': return ProcessPrimitive(large_binary()); + case 'v': + return ProcessBinaryView(); case 'w': return ProcessFixedSizeBinary(); case 'd': @@ -1080,6 +1104,17 @@ struct SchemaImporter { return f_parser_.Invalid(); } + Status ProcessBinaryView() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'z': + return ProcessPrimitive(binary_view()); + case 'u': + return ProcessPrimitive(utf8_view()); + } + return f_parser_.Invalid(); + } + Status ProcessTemporal() { RETURN_NOT_OK(f_parser_.CheckHasNext()); switch (f_parser_.Next()) { @@ -1360,8 +1395,8 @@ struct SchemaImporter { return Status::OK(); } - struct ArrowSchema* c_struct_; - SchemaExportGuard guard_; + struct ArrowSchema* c_struct_{nullptr}; + SchemaExportGuard guard_{nullptr}; FormatStringParser f_parser_; int64_t recursion_level_; std::vector child_importers_; @@ -1429,7 +1464,7 @@ class ImportedBuffer : public Buffer { std::shared_ptr import) : Buffer(data, size, mm, nullptr, device_type), import_(std::move(import)) {} - ~ImportedBuffer() override {} + ~ImportedBuffer() override = default; std::shared_ptr device_sync_event() override { return import_->device_sync_; @@ -1441,9 +1476,7 @@ class ImportedBuffer : public Buffer { struct ArrayImporter { explicit ArrayImporter(const std::shared_ptr& type) - : type_(type), - zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)), - device_type_(DeviceAllocationType::kCPU) {} + : type_(type), zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)) {} Status Import(struct ArrowDeviceArray* src, const DeviceMemoryMapper& mapper) { ARROW_ASSIGN_OR_RAISE(memory_mgr_, mapper(src->device_type, src->device_id)); @@ -1591,6 +1624,10 @@ struct ArrayImporter { Status Visit(const LargeBinaryType& type) { return ImportStringLike(type); } + Status Visit(const StringViewType& type) { return ImportBinaryView(type); } + + Status Visit(const BinaryViewType& type) { return ImportBinaryView(type); } + Status Visit(const ListType& type) { return ImportListLike(type); } Status Visit(const LargeListType& type) { return ImportListLike(type); } @@ -1673,6 +1710,28 @@ struct ArrayImporter { return Status::OK(); } + Status ImportBinaryView(const BinaryViewType&) { + RETURN_NOT_OK(CheckNoChildren()); + if (c_struct_->n_buffers < 3) { + return Status::Invalid("Expected at least 3 buffers for imported type ", + type_->ToString(), ", ArrowArray struct has ", + c_struct_->n_buffers); + } + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportFixedSizeBuffer(1, BinaryViewType::kSize)); + + // The last C data buffer stores buffer sizes, and shouldn't be imported + auto* buffer_sizes = + static_cast(c_struct_->buffers[c_struct_->n_buffers - 1]); + + for (int32_t buffer_id = 2; buffer_id < c_struct_->n_buffers - 1; ++buffer_id) { + RETURN_NOT_OK(ImportBuffer(buffer_id, buffer_sizes[buffer_id - 2])); + } + data_->buffers.pop_back(); + return Status::OK(); + } + template Status ImportStringLike(const StringType& type) { RETURN_NOT_OK(CheckNoChildren()); @@ -1836,7 +1895,7 @@ struct ArrayImporter { std::shared_ptr zero_size_buffer_; std::shared_ptr memory_mgr_; - DeviceAllocationType device_type_; + DeviceAllocationType device_type_{DeviceAllocationType::kCPU}; }; } // namespace @@ -2042,7 +2101,7 @@ class ArrayStreamBatchReader : public RecordBatchReader { DCHECK(!ArrowArrayStreamIsReleased(&stream_)); } - ~ArrayStreamBatchReader() { + ~ArrayStreamBatchReader() override { if (!ArrowArrayStreamIsReleased(&stream_)) { ArrowArrayStreamRelease(&stream_); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 362df833781a1..326c67f5eceac 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -38,11 +38,13 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/util.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" // TODO(GH-37221): Remove these ifdef checks when compute dependency is removed #ifdef ARROW_COMPUTE @@ -58,6 +60,7 @@ using internal::ArrayStreamExportTraits; using internal::checked_cast; using internal::SchemaExportGuard; using internal::SchemaExportTraits; +using internal::Zip; template struct ExportTraits {}; @@ -91,7 +94,7 @@ class ReleaseCallback { public: using CType = typename Traits::CType; - explicit ReleaseCallback(CType* c_struct) : called_(false) { + explicit ReleaseCallback(CType* c_struct) { orig_release_ = c_struct->release; orig_private_data_ = c_struct->private_data; c_struct->release = StaticRelease; @@ -123,7 +126,7 @@ class ReleaseCallback { private: ARROW_DISALLOW_COPY_AND_ASSIGN(ReleaseCallback); - bool called_; + bool called_{false}; void (*orig_release_)(CType*); void* orig_private_data_; }; @@ -238,8 +241,7 @@ struct SchemaExportChecker { flattened_flags.empty() ? std::vector(flattened_formats_.size(), kDefaultFlags) : std::move(flattened_flags)), - flattened_metadata_(std::move(flattened_metadata)), - flattened_index_(0) {} + flattened_metadata_(std::move(flattened_metadata)) {} void operator()(struct ArrowSchema* c_export, bool inner = false) { ASSERT_LT(flattened_index_, flattened_formats_.size()); @@ -288,7 +290,7 @@ struct SchemaExportChecker { const std::vector flattened_names_; std::vector flattened_flags_; const std::vector flattened_metadata_; - size_t flattened_index_; + size_t flattened_index_{0}; }; class TestSchemaExport : public ::testing::Test { @@ -354,6 +356,8 @@ TEST_F(TestSchemaExport, Primitive) { TestPrimitive(large_binary(), "Z"); TestPrimitive(utf8(), "u"); TestPrimitive(large_utf8(), "U"); + TestPrimitive(binary_view(), "vz"); + TestPrimitive(utf8_view(), "vu"); TestPrimitive(decimal(16, 4), "d:16,4"); TestPrimitive(decimal256(16, 4), "d:16,4,256"); @@ -565,12 +569,24 @@ struct ArrayExportChecker { --expected_n_buffers; ++expected_buffers; } - ASSERT_EQ(c_export->n_buffers, expected_n_buffers); + bool has_variadic_buffer_sizes = expected_data.type->id() == Type::STRING_VIEW || + expected_data.type->id() == Type::BINARY_VIEW; + ASSERT_EQ(c_export->n_buffers, expected_n_buffers + has_variadic_buffer_sizes); ASSERT_NE(c_export->buffers, nullptr); - for (int64_t i = 0; i < c_export->n_buffers; ++i) { + + for (int64_t i = 0; i < expected_n_buffers; ++i) { auto expected_ptr = expected_buffers[i] ? expected_buffers[i]->data() : nullptr; ASSERT_EQ(c_export->buffers[i], expected_ptr); } + if (has_variadic_buffer_sizes) { + auto variadic_buffers = util::span(expected_data.buffers).subspan(2); + auto variadic_buffer_sizes = util::span( + static_cast(c_export->buffers[c_export->n_buffers - 1]), + variadic_buffers.size()); + for (auto [buf, size] : Zip(variadic_buffers, variadic_buffer_sizes)) { + ASSERT_EQ(buf->size(), size); + } + } if (expected_data.dictionary != nullptr) { // Recurse into dictionary @@ -883,6 +899,8 @@ TEST_F(TestArrayExport, Primitive) { TestPrimitive(large_binary(), R"(["foo", "bar", null])"); TestPrimitive(utf8(), R"(["foo", "bar", null])"); TestPrimitive(large_utf8(), R"(["foo", "bar", null])"); + TestPrimitive(binary_view(), R"(["foo", "bar", null])"); + TestPrimitive(utf8_view(), R"(["foo", "bar", null])"); TestPrimitive(decimal(16, 4), R"(["1234.5670", null])"); TestPrimitive(decimal256(16, 4), R"(["1234.5670", null])"); @@ -896,6 +914,39 @@ TEST_F(TestArrayExport, PrimitiveSliced) { TestPrimitive(factory); } +constexpr std::string_view binary_view_buffer_content0 = "12345foo bar baz quux", + binary_view_buffer_content1 = "BinaryViewMultipleBuffers"; + +static const BinaryViewType::c_type binary_view_buffer1[] = { + util::ToBinaryView(binary_view_buffer_content0, 0, 0), + util::ToInlineBinaryView("foo"), + util::ToBinaryView(binary_view_buffer_content1, 1, 0), + util::ToInlineBinaryView("bar"), + util::ToBinaryView(binary_view_buffer_content0.substr(5), 0, 5), + util::ToInlineBinaryView("baz"), + util::ToBinaryView(binary_view_buffer_content1.substr(6, 13), 1, 6), + util::ToInlineBinaryView("quux"), +}; + +static auto MakeBinaryViewArrayWithMultipleDataBuffers() { + static const auto kLength = static_cast(std::size(binary_view_buffer1)); + return std::make_shared( + binary_view(), kLength, + Buffer::FromVector(std::vector(binary_view_buffer1, binary_view_buffer1 + kLength)), + BufferVector{ + Buffer::FromString(std::string{binary_view_buffer_content0}), + Buffer::FromString(std::string{binary_view_buffer_content1}), + }); +} + +TEST_F(TestArrayExport, BinaryViewMultipleBuffers) { + TestPrimitive(MakeBinaryViewArrayWithMultipleDataBuffers); + TestPrimitive([&] { + auto arr = MakeBinaryViewArrayWithMultipleDataBuffers(); + return arr->Slice(1, arr->length() - 2); + }); +} + TEST_F(TestArrayExport, Null) { TestPrimitive(null(), "[null, null, null]"); TestPrimitive(null(), "[]"); @@ -1220,13 +1271,16 @@ TEST_F(TestArrayExport, ExportRecordBatch) { static const char kMyDeviceTypeName[] = "arrowtest::MyDevice"; static const ArrowDeviceType kMyDeviceType = ARROW_DEVICE_EXT_DEV; -static const void* kMyEventPtr = reinterpret_cast(uintptr_t(0xBAADF00D)); +static const void* kMyEventPtr = + reinterpret_cast(static_cast(0xBAADF00D)); class MyBuffer final : public MutableBuffer { public: using MutableBuffer::MutableBuffer; - ~MyBuffer() { default_memory_pool()->Free(const_cast(data_), size_); } + ~MyBuffer() override { + default_memory_pool()->Free(const_cast(data_), size_); + } std::shared_ptr device_sync_event() override { return device_sync_; } @@ -1256,7 +1310,7 @@ class MyDevice : public Device { explicit MySyncEvent(void* sync_event, release_fn_t release_sync_event) : Device::SyncEvent(sync_event, release_sync_event) {} - virtual ~MySyncEvent() = default; + ~MySyncEvent() override = default; Status Wait() override { return Status::OK(); } Status Record(const Device::Stream&) override { return Status::OK(); } }; @@ -1966,6 +2020,10 @@ TEST_F(TestSchemaImport, String) { CheckImport(large_utf8()); FillPrimitive("Z"); CheckImport(large_binary()); + FillPrimitive("vu"); + CheckImport(utf8_view()); + FillPrimitive("vz"); + CheckImport(binary_view()); FillPrimitive("w:3"); CheckImport(fixed_size_binary(3)); @@ -2419,6 +2477,16 @@ static const void* large_string_buffers_no_nulls1[3] = { static const void* large_string_buffers_omitted[3] = { nullptr, large_string_offsets_buffer1, nullptr}; +constexpr int64_t binary_view_buffer_sizes1[] = {binary_view_buffer_content0.size(), + binary_view_buffer_content1.size()}; +static const void* binary_view_buffers_no_nulls1[] = { + nullptr, + binary_view_buffer1, + binary_view_buffer_content0.data(), + binary_view_buffer_content1.data(), + binary_view_buffer_sizes1, +}; + static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1}; static const void* list_buffers_nulls1[2] = {bits_buffer1, list_offsets_buffer1}; @@ -2510,6 +2578,16 @@ class TestArrayImport : public ::testing::Test { c->buffers = buffers; } + void FillStringViewLike(struct ArrowArray* c, int64_t length, int64_t null_count, + int64_t offset, const void** buffers, + int32_t data_buffer_count) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 2 + data_buffer_count + 1; + c->buffers = buffers; + } + void FillListLike(struct ArrowArray* c, int64_t length, int64_t null_count, int64_t offset, const void** buffers) { c->length = length; @@ -2583,6 +2661,12 @@ class TestArrayImport : public ::testing::Test { FillStringLike(&c_struct_, length, null_count, offset, buffers); } + void FillStringViewLike(int64_t length, int64_t null_count, int64_t offset, + const void** buffers, int32_t data_buffer_count) { + FillStringViewLike(&c_struct_, length, null_count, offset, buffers, + data_buffer_count); + } + void FillListLike(int64_t length, int64_t null_count, int64_t offset, const void** buffers) { FillListLike(&c_struct_, length, null_count, offset, buffers); @@ -2834,6 +2918,10 @@ TEST_F(TestArrayImport, String) { FillStringLike(4, 0, 0, large_string_buffers_no_nulls1); CheckImport(ArrayFromJSON(large_binary(), R"(["foo", "", "bar", "quux"])")); + auto length = static_cast(std::size(binary_view_buffer1)); + FillStringViewLike(length, 0, 0, binary_view_buffers_no_nulls1, 2); + CheckImport(MakeBinaryViewArrayWithMultipleDataBuffers()); + // Empty array with null data pointers FillStringLike(0, 0, 0, string_buffers_omitted); CheckImport(ArrayFromJSON(utf8(), "[]")); @@ -3530,15 +3618,16 @@ TEST_F(TestSchemaRoundtrip, Primitive) { TestWithTypeFactory(boolean); TestWithTypeFactory(float16); - TestWithTypeFactory(std::bind(decimal128, 19, 4)); - TestWithTypeFactory(std::bind(decimal256, 19, 4)); - TestWithTypeFactory(std::bind(decimal128, 19, 0)); - TestWithTypeFactory(std::bind(decimal256, 19, 0)); - TestWithTypeFactory(std::bind(decimal128, 19, -5)); - TestWithTypeFactory(std::bind(decimal256, 19, -5)); - TestWithTypeFactory(std::bind(fixed_size_binary, 3)); + TestWithTypeFactory([] { return decimal128(19, 4); }); + TestWithTypeFactory([] { return decimal256(19, 4); }); + TestWithTypeFactory([] { return decimal128(19, 0); }); + TestWithTypeFactory([] { return decimal256(19, 0); }); + TestWithTypeFactory([] { return decimal128(19, -5); }); + TestWithTypeFactory([] { return decimal256(19, -5); }); + TestWithTypeFactory([] { return fixed_size_binary(3); }); TestWithTypeFactory(binary); TestWithTypeFactory(large_utf8); + TestWithTypeFactory(binary_view); } TEST_F(TestSchemaRoundtrip, Temporal) { @@ -3546,8 +3635,8 @@ TEST_F(TestSchemaRoundtrip, Temporal) { TestWithTypeFactory(day_time_interval); TestWithTypeFactory(month_interval); TestWithTypeFactory(month_day_nano_interval); - TestWithTypeFactory(std::bind(time64, TimeUnit::NANO)); - TestWithTypeFactory(std::bind(duration, TimeUnit::MICRO)); + TestWithTypeFactory([] { return time64(TimeUnit::NANO); }); + TestWithTypeFactory([] { return duration(TimeUnit::MICRO); }); TestWithTypeFactory([]() { return arrow::timestamp(TimeUnit::MICRO, "Europe/Paris"); }); } @@ -3803,6 +3892,14 @@ TEST_F(TestArrayRoundtrip, Primitive) { R"([[4, 5, 6], [1, -600, 5000], null, null])"); } +TEST_F(TestArrayRoundtrip, BinaryViewMultipleBuffers) { + TestWithArrayFactory(MakeBinaryViewArrayWithMultipleDataBuffers); + TestWithArrayFactory([&] { + auto arr = MakeBinaryViewArrayWithMultipleDataBuffers(); + return arr->Slice(1, arr->length() - 2); + }); +} + TEST_F(TestArrayRoundtrip, UnknownNullCount) { TestWithArrayFactory([]() -> Result> { auto arr = ArrayFromJSON(int32(), "[0, 1, 2]"); @@ -4205,8 +4302,6 @@ TEST_F(TestDeviceArrayRoundtrip, Primitive) { TestWithJSON(mm, int32(), "[4, 5, null]"); } -// TODO C -> C++ -> C roundtripping tests? - //////////////////////////////////////////////////////////////////////////// // Array stream export tests diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 7a3b45f9788e6..7211b6c87a055 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -42,7 +42,7 @@ BOOL = ArrowBool() -@click.group() +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.option("--debug", type=BOOL, is_flag=True, default=False, help="Increase logging with debugging output.") @click.option("--pdb", type=BOOL, is_flag=True, default=False, diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index ff10c0bb03fb6..80cc1c1e76425 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1858,9 +1858,7 @@ def _temp_path(): .skip_tester('Go') .skip_tester('Java') .skip_tester('JS') - .skip_tester('Rust') - .skip_format(SKIP_C_SCHEMA, 'C++') - .skip_format(SKIP_C_ARRAY, 'C++'), + .skip_tester('Rust'), generate_extension_case() .skip_tester('C#') diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index e2022171214b7..66cc7525822bf 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -140,10 +140,14 @@ strings: +-----------------+---------------------------------------------------+------------+ | ``Z`` | large binary | | +-----------------+---------------------------------------------------+------------+ +| ``vz`` | binary view | | ++-----------------+---------------------------------------------------+------------+ | ``u`` | utf-8 string | | +-----------------+---------------------------------------------------+------------+ | ``U`` | large utf-8 string | | +-----------------+---------------------------------------------------+------------+ +| ``vu`` | utf-8 view | | ++-----------------+---------------------------------------------------+------------+ | ``d:19,10`` | decimal128 [precision 19, scale 10] | | +-----------------+---------------------------------------------------+------------+ | ``d:19,10,NNN`` | decimal bitwidth = NNN [precision 19, scale 10] | | @@ -548,6 +552,14 @@ parameterized extension types). The ``ArrowArray`` structure exported from an extension array simply points to the storage data of the extension array. +Binary view arrays +------------------ + +For binary or utf-8 view arrays, an extra buffer is appended which stores +the lengths of each variadic data buffer as ``int64_t``. This buffer is +necessary since these buffer lengths are not trivially extractable from +other data in an array of binary or utf-8 view type. + .. _c-data-interface-semantics: Semantics