Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-36099: [C++] Add Utf8View and BinaryView to the c ABI #38443

Merged
merged 4 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 75 additions & 14 deletions cpp/src/arrow/c/bridge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <algorithm>
#include <cerrno>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
Expand All @@ -41,6 +42,7 @@
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/range.h"
#include "arrow/util/small_vector.h"
#include "arrow/util/string.h"
#include "arrow/util/value_parsing.h"
Expand All @@ -51,6 +53,7 @@ namespace arrow {
using internal::checked_cast;
using internal::checked_pointer_cast;


using internal::SmallVector;
using internal::StaticVector;

Expand Down Expand Up @@ -260,7 +263,7 @@ struct SchemaExporter {
// Dictionary type: parent struct describes index type,
// child dictionary struct describes value type.
RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), this));
dict_exporter_.reset(new SchemaExporter());
dict_exporter_ = std::make_unique<SchemaExporter>();
Comment on lines -263 to +265
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i've actually wondered.... is there any particular difference (performance or otherwise) doing .reset(new T) vs = std::make_unique<T>()?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In scenarios where you have an std::unique_ptr<ParentClass> being populated by many sub-classes of ParentClass:

std::make_unique<T, Deleter>(...) can lead to more binary bloat because class Deleter = std::default_delete<T> is type-specialized whereas reset() takes a super-class pointer and only needs the deleter that invokes the destructor via dynamic dispatching through the parent class vtable.

Being type-specialized often leads to the derived-classes destructors' being inlined into the std::default_deleter<T>. All this extra inlining and removal of one indirection could lead to less overhead of destructor dispatching.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In that case, wouldn't it be better to keep this as calling .reset rather than switch to make_unique?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case there is no difference since the assigned pointer is a unique_ptr<SchemaExporter> and not a pointer to a base/parent class.

RETURN_NOT_OK(dict_exporter_->ExportType(*dict_type.value_type()));
} else {
RETURN_NOT_OK(VisitTypeInline(type, this));
Expand Down Expand Up @@ -357,10 +360,14 @@ struct SchemaExporter {

Status Visit(const LargeBinaryType& type) { return SetFormat("Z"); }

Status Visit(const BinaryViewType& type) { return SetFormat("vz"); }

Status Visit(const StringType& type) { return SetFormat("u"); }

Status Visit(const LargeStringType& type) { return SetFormat("U"); }

Status Visit(const StringViewType& type) { return SetFormat("vu"); }

Status Visit(const Date32Type& type) { return SetFormat("tdD"); }

Status Visit(const Date64Type& type) { return SetFormat("tdm"); }
Expand Down Expand Up @@ -521,13 +528,14 @@ namespace {

struct ExportedArrayPrivateData : PoolAllocationMixin<ExportedArrayPrivateData> {
// The buffers are owned by the ArrayData member
StaticVector<const void*, 3> buffers_;
SmallVector<const void*, 3> buffers_;
struct ArrowArray dictionary_;
SmallVector<struct ArrowArray, 1> children_;
SmallVector<struct ArrowArray*, 4> child_pointers_;

std::shared_ptr<ArrayData> data_;
std::shared_ptr<Device::SyncEvent> sync_;
std::vector<int64_t> variadic_buffer_sizes_;

ExportedArrayPrivateData() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData);
Expand Down Expand Up @@ -570,15 +578,32 @@ struct ArrayExporter {
--n_buffers;
++buffers_begin;
}

bool need_variadic_buffer_sizes =
data->type->id() == Type::BINARY_VIEW || data->type->id() == Type::STRING_VIEW;
if (need_variadic_buffer_sizes) {
++n_buffers;
}

export_.buffers_.resize(n_buffers);
std::transform(buffers_begin, data->buffers.end(), export_.buffers_.begin(),
[](const std::shared_ptr<Buffer>& buffer) -> const void* {
return buffer ? buffer->data() : nullptr;
});

if (need_variadic_buffer_sizes) {
auto variadic_buffers = util::span(data->buffers).subspan(2);
export_.variadic_buffer_sizes_.resize(variadic_buffers.size());
size_t i = 0;
for (const auto& buf : variadic_buffers) {
export_.variadic_buffer_sizes_[i++] = buf->size();
}
export_.buffers_.back() = export_.variadic_buffer_sizes_.data();
}

// Export dictionary
if (data->dictionary != nullptr) {
dict_exporter_.reset(new ArrayExporter());
dict_exporter_ = std::make_unique<ArrayExporter>();
RETURN_NOT_OK(dict_exporter_->Export(data->dictionary));
}

Expand Down Expand Up @@ -795,7 +820,7 @@ Status InvalidFormatString(std::string_view v) {

class FormatStringParser {
public:
FormatStringParser() {}
FormatStringParser() = default;

explicit FormatStringParser(std::string_view v) : view_(v), index_(0) {}

Expand Down Expand Up @@ -941,8 +966,6 @@ Result<DecodedMetadata> DecodeMetadata(const char* metadata) {
}

struct SchemaImporter {
SchemaImporter() : c_struct_(nullptr), guard_(nullptr) {}

Status Import(struct ArrowSchema* src) {
if (ArrowSchemaIsReleased(src)) {
return Status::Invalid("Cannot import released ArrowSchema");
Expand Down Expand Up @@ -1068,6 +1091,8 @@ struct SchemaImporter {
return ProcessPrimitive(binary());
case 'Z':
return ProcessPrimitive(large_binary());
case 'v':
return ProcessBinaryView();
case 'w':
return ProcessFixedSizeBinary();
case 'd':
Expand All @@ -1080,6 +1105,17 @@ struct SchemaImporter {
return f_parser_.Invalid();
}

Status ProcessBinaryView() {
RETURN_NOT_OK(f_parser_.CheckHasNext());
switch (f_parser_.Next()) {
case 'z':
return ProcessPrimitive(binary_view());
case 'u':
return ProcessPrimitive(utf8_view());
}
return f_parser_.Invalid();
}

Status ProcessTemporal() {
RETURN_NOT_OK(f_parser_.CheckHasNext());
switch (f_parser_.Next()) {
Expand Down Expand Up @@ -1360,8 +1396,8 @@ struct SchemaImporter {
return Status::OK();
}

struct ArrowSchema* c_struct_;
SchemaExportGuard guard_;
struct ArrowSchema* c_struct_{nullptr};
SchemaExportGuard guard_{nullptr};
FormatStringParser f_parser_;
int64_t recursion_level_;
std::vector<SchemaImporter> child_importers_;
Expand Down Expand Up @@ -1429,7 +1465,7 @@ class ImportedBuffer : public Buffer {
std::shared_ptr<ImportedArrayData> import)
: Buffer(data, size, mm, nullptr, device_type), import_(std::move(import)) {}

~ImportedBuffer() override {}
~ImportedBuffer() override = default;

std::shared_ptr<Device::SyncEvent> device_sync_event() override {
return import_->device_sync_;
Expand All @@ -1441,9 +1477,7 @@ class ImportedBuffer : public Buffer {

struct ArrayImporter {
explicit ArrayImporter(const std::shared_ptr<DataType>& type)
: type_(type),
zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)),
device_type_(DeviceAllocationType::kCPU) {}
bkietz marked this conversation as resolved.
Show resolved Hide resolved
: type_(type), zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)) {}

Status Import(struct ArrowDeviceArray* src, const DeviceMemoryMapper& mapper) {
ARROW_ASSIGN_OR_RAISE(memory_mgr_, mapper(src->device_type, src->device_id));
Expand Down Expand Up @@ -1591,6 +1625,10 @@ struct ArrayImporter {

Status Visit(const LargeBinaryType& type) { return ImportStringLike(type); }

Status Visit(const StringViewType& type) { return ImportBinaryView(type); }

Status Visit(const BinaryViewType& type) { return ImportBinaryView(type); }

Status Visit(const ListType& type) { return ImportListLike(type); }

Status Visit(const LargeListType& type) { return ImportListLike(type); }
Expand Down Expand Up @@ -1673,6 +1711,28 @@ struct ArrayImporter {
return Status::OK();
}

Status ImportBinaryView(const BinaryViewType&) {
RETURN_NOT_OK(CheckNoChildren());
if (c_struct_->n_buffers < 3) {
return Status::Invalid("Expected at least 3 buffers for imported type ",
type_->ToString(), ", ArrowArray struct has ",
c_struct_->n_buffers);
}
RETURN_NOT_OK(AllocateArrayData());
RETURN_NOT_OK(ImportNullBitmap());
RETURN_NOT_OK(ImportFixedSizeBuffer(1, BinaryViewType::kSize));

// The last C data buffer stores buffer sizes, and shouldn't be imported
auto* buffer_sizes =
static_cast<const int64_t*>(c_struct_->buffers[c_struct_->n_buffers - 1]);

for (int32_t buffer_id = 2; buffer_id < c_struct_->n_buffers - 1; ++buffer_id) {
RETURN_NOT_OK(ImportBuffer(buffer_id, buffer_sizes[buffer_id - 2]));
}
data_->buffers.pop_back();
return Status::OK();
}

template <typename StringType>
Status ImportStringLike(const StringType& type) {
RETURN_NOT_OK(CheckNoChildren());
Expand Down Expand Up @@ -1836,7 +1896,8 @@ struct ArrayImporter {
std::shared_ptr<Buffer> zero_size_buffer_;

std::shared_ptr<MemoryManager> memory_mgr_;
DeviceAllocationType device_type_;
DeviceAllocationType device_type_{DeviceAllocationType::kCPU};
;
};

} // namespace
Expand Down Expand Up @@ -2042,7 +2103,7 @@ class ArrayStreamBatchReader : public RecordBatchReader {
DCHECK(!ArrowArrayStreamIsReleased(&stream_));
}

~ArrayStreamBatchReader() {
~ArrayStreamBatchReader() override {
if (!ArrowArrayStreamIsReleased(&stream_)) {
ArrowArrayStreamRelease(&stream_);
}
Expand Down
Loading
Loading