apache · bkietz · Nov 29, 2023 · Oct 24, 2023 · Nov 9, 2023 · Nov 29, 2023
diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc
@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <cerrno>
 #include <cstring>
+#include <memory>
 #include <string>
 #include <string_view>
 #include <utility>
@@ -41,6 +42,7 @@
 #include "arrow/util/key_value_metadata.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/range.h"
 #include "arrow/util/small_vector.h"
 #include "arrow/util/string.h"
 #include "arrow/util/value_parsing.h"
@@ -51,6 +53,7 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+
 using internal::SmallVector;
 using internal::StaticVector;
 
@@ -260,7 +263,7 @@ struct SchemaExporter {
       // Dictionary type: parent struct describes index type,
       // child dictionary struct describes value type.
       RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), this));
-      dict_exporter_.reset(new SchemaExporter());
+      dict_exporter_ = std::make_unique<SchemaExporter>();
       RETURN_NOT_OK(dict_exporter_->ExportType(*dict_type.value_type()));
     } else {
       RETURN_NOT_OK(VisitTypeInline(type, this));
@@ -357,10 +360,14 @@ struct SchemaExporter {
 
   Status Visit(const LargeBinaryType& type) { return SetFormat("Z"); }
 
+  Status Visit(const BinaryViewType& type) { return SetFormat("vz"); }
+
   Status Visit(const StringType& type) { return SetFormat("u"); }
 
   Status Visit(const LargeStringType& type) { return SetFormat("U"); }
 
+  Status Visit(const StringViewType& type) { return SetFormat("vu"); }
+
   Status Visit(const Date32Type& type) { return SetFormat("tdD"); }
 
   Status Visit(const Date64Type& type) { return SetFormat("tdm"); }
@@ -521,13 +528,14 @@ namespace {
 
 struct ExportedArrayPrivateData : PoolAllocationMixin<ExportedArrayPrivateData> {
   // The buffers are owned by the ArrayData member
-  StaticVector<const void*, 3> buffers_;
+  SmallVector<const void*, 3> buffers_;
   struct ArrowArray dictionary_;
   SmallVector<struct ArrowArray, 1> children_;
   SmallVector<struct ArrowArray*, 4> child_pointers_;
 
   std::shared_ptr<ArrayData> data_;
   std::shared_ptr<Device::SyncEvent> sync_;
+  std::vector<int64_t> variadic_buffer_sizes_;
 
   ExportedArrayPrivateData() = default;
   ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData);
@@ -570,15 +578,32 @@ struct ArrayExporter {
       --n_buffers;
       ++buffers_begin;
     }
+
+    bool need_variadic_buffer_sizes =
+        data->type->id() == Type::BINARY_VIEW || data->type->id() == Type::STRING_VIEW;
+    if (need_variadic_buffer_sizes) {
+      ++n_buffers;
+    }
+
     export_.buffers_.resize(n_buffers);
     std::transform(buffers_begin, data->buffers.end(), export_.buffers_.begin(),
                    [](const std::shared_ptr<Buffer>& buffer) -> const void* {
                      return buffer ? buffer->data() : nullptr;
                    });
 
+    if (need_variadic_buffer_sizes) {
+      auto variadic_buffers = util::span(data->buffers).subspan(2);
+      export_.variadic_buffer_sizes_.resize(variadic_buffers.size());
+      size_t i = 0;
+      for (const auto& buf : variadic_buffers) {
+        export_.variadic_buffer_sizes_[i++] = buf->size();
+      }
+      export_.buffers_.back() = export_.variadic_buffer_sizes_.data();
+    }
+
     // Export dictionary
     if (data->dictionary != nullptr) {
-      dict_exporter_.reset(new ArrayExporter());
+      dict_exporter_ = std::make_unique<ArrayExporter>();
       RETURN_NOT_OK(dict_exporter_->Export(data->dictionary));
     }
 
@@ -795,7 +820,7 @@ Status InvalidFormatString(std::string_view v) {
 
 class FormatStringParser {
  public:
-  FormatStringParser() {}
+  FormatStringParser() = default;
 
   explicit FormatStringParser(std::string_view v) : view_(v), index_(0) {}
 
@@ -941,8 +966,6 @@ Result<DecodedMetadata> DecodeMetadata(const char* metadata) {
 }
 
 struct SchemaImporter {
-  SchemaImporter() : c_struct_(nullptr), guard_(nullptr) {}
-
   Status Import(struct ArrowSchema* src) {
     if (ArrowSchemaIsReleased(src)) {
       return Status::Invalid("Cannot import released ArrowSchema");
@@ -1068,6 +1091,8 @@ struct SchemaImporter {
         return ProcessPrimitive(binary());
       case 'Z':
         return ProcessPrimitive(large_binary());
+      case 'v':
+        return ProcessBinaryView();
       case 'w':
         return ProcessFixedSizeBinary();
       case 'd':
@@ -1080,6 +1105,17 @@ struct SchemaImporter {
     return f_parser_.Invalid();
   }
 
+  // Dispatch on the character following 'v': "vz" maps to binary_view() and "vu"
+  // to utf8_view(), matching the format strings SchemaExporter sets for
+  // BinaryViewType and StringViewType. Anything else yields f_parser_.Invalid().
+  Status ProcessBinaryView() {
+    RETURN_NOT_OK(f_parser_.CheckHasNext());
+    const auto view_kind = f_parser_.Next();
+    if (view_kind == 'z') return ProcessPrimitive(binary_view());
+    if (view_kind == 'u') return ProcessPrimitive(utf8_view());
+    return f_parser_.Invalid();
+  }
+
   Status ProcessTemporal() {
     RETURN_NOT_OK(f_parser_.CheckHasNext());
     switch (f_parser_.Next()) {
@@ -1360,8 +1396,8 @@ struct SchemaImporter {
     return Status::OK();
   }
 
-  struct ArrowSchema* c_struct_;
-  SchemaExportGuard guard_;
+  struct ArrowSchema* c_struct_{nullptr};
+  SchemaExportGuard guard_{nullptr};
   FormatStringParser f_parser_;
   int64_t recursion_level_;
   std::vector<SchemaImporter> child_importers_;
@@ -1429,7 +1465,7 @@ class ImportedBuffer : public Buffer {
                  std::shared_ptr<ImportedArrayData> import)
       : Buffer(data, size, mm, nullptr, device_type), import_(std::move(import)) {}
 
-  ~ImportedBuffer() override {}
+  ~ImportedBuffer() override = default;
 
   std::shared_ptr<Device::SyncEvent> device_sync_event() override {
     return import_->device_sync_;
@@ -1441,9 +1477,7 @@ class ImportedBuffer : public Buffer {
 
 struct ArrayImporter {
   explicit ArrayImporter(const std::shared_ptr<DataType>& type)
-      : type_(type),
-        zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)),
-        device_type_(DeviceAllocationType::kCPU) {}
+      : type_(type), zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)) {}
 
   Status Import(struct ArrowDeviceArray* src, const DeviceMemoryMapper& mapper) {
     ARROW_ASSIGN_OR_RAISE(memory_mgr_, mapper(src->device_type, src->device_id));
@@ -1591,6 +1625,10 @@ struct ArrayImporter {
 
   Status Visit(const LargeBinaryType& type) { return ImportStringLike(type); }
 
+  Status Visit(const StringViewType& type) { return ImportBinaryView(type); }
+
+  Status Visit(const BinaryViewType& type) { return ImportBinaryView(type); }
+
   Status Visit(const ListType& type) { return ImportListLike(type); }
 
   Status Visit(const LargeListType& type) { return ImportListLike(type); }
@@ -1673,6 +1711,47 @@ struct ArrayImporter {
     return Status::OK();
   }
 
+  // Import a binary-view / string-view array from the C struct.
+  //
+  // The C struct must carry at least 3 buffers. The null bitmap is imported via
+  // ImportNullBitmap(), buffer 1 via ImportFixedSizeBuffer() with
+  // BinaryViewType::kSize, and buffers 2 .. n_buffers - 2 are imported as
+  // variadic data buffers. The final C buffer is not imported: it is read as an
+  // int64_t array giving the size of each variadic buffer.
+  //
+  // Returns Status::Invalid if there are too few buffers, or if variadic buffers
+  // are present but the sizes buffer pointer is null.
+  Status ImportBinaryView(const BinaryViewType&) {
+    RETURN_NOT_OK(CheckNoChildren());
+    if (c_struct_->n_buffers < 3) {
+      return Status::Invalid("Expected at least 3 buffers for imported type ",
+                             type_->ToString(), ", ArrowArray struct has ",
+                             c_struct_->n_buffers);
+    }
+    RETURN_NOT_OK(AllocateArrayData());
+    RETURN_NOT_OK(ImportNullBitmap());
+    RETURN_NOT_OK(ImportFixedSizeBuffer(1, BinaryViewType::kSize));
+
+    // The last C data buffer stores buffer sizes, and shouldn't be imported
+    auto* buffer_sizes =
+        static_cast<const int64_t*>(c_struct_->buffers[c_struct_->n_buffers - 1]);
+    // The sizes are only read when at least one variadic buffer exists, so a null
+    // pointer is tolerated for n_buffers == 3 but rejected otherwise rather than
+    // dereferenced in the loop below.
+    if (buffer_sizes == nullptr && c_struct_->n_buffers > 3) {
+      return Status::Invalid("Null variadic buffer sizes pointer for imported type ",
+                             type_->ToString());
+    }
+
+    for (int32_t buffer_id = 2; buffer_id < c_struct_->n_buffers - 1; ++buffer_id) {
+      RETURN_NOT_OK(ImportBuffer(buffer_id, buffer_sizes[buffer_id - 2]));
+    }
+    // Drop the trailing slot that lined up with the sizes buffer (presumably
+    // AllocateArrayData() created one slot per C buffer -- confirm).
+    data_->buffers.pop_back();
+    return Status::OK();
+  }
+
   template <typename StringType>
   Status ImportStringLike(const StringType& type) {
     RETURN_NOT_OK(CheckNoChildren());
@@ -1836,7 +1896,8 @@ struct ArrayImporter {
   std::shared_ptr<Buffer> zero_size_buffer_;
 
   std::shared_ptr<MemoryManager> memory_mgr_;
-  DeviceAllocationType device_type_;
+  // Default-initialized in-class to kCPU (no constructor init needed).
+  DeviceAllocationType device_type_{DeviceAllocationType::kCPU};
 };
 
 }  // namespace
@@ -2042,7 +2103,7 @@ class ArrayStreamBatchReader : public RecordBatchReader {
     DCHECK(!ArrowArrayStreamIsReleased(&stream_));
   }
 
-  ~ArrayStreamBatchReader() {
+  ~ArrayStreamBatchReader() override {
     if (!ArrowArrayStreamIsReleased(&stream_)) {
       ArrowArrayStreamRelease(&stream_);
     }