From f112912353923ee88c7d736e86f658d723f234ff Mon Sep 17 00:00:00 2001 From: Chengcheng Jin Date: Fri, 2 Aug 2024 12:13:56 +0000 Subject: [PATCH] address comments and fix test --- java/dataset/src/main/cpp/jni_wrapper.cc | 22 +++++++++---------- .../scanner/csv/CsvFragmentScanOptions.java | 5 +++-- .../dataset/TestFragmentScanOptions.java | 22 +++++++++++-------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 5b5347f4e74ee..63b8dd73f4720 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -368,9 +368,9 @@ std::shared_ptr LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec inline bool ParseBool(const std::string& value) { return value == "true" ? true : false; } -inline bool ParseChar(const std::string& value) { +inline bool ParseChar(const std::string& key, const std::string& value) { if (value.size() != 1) { - JniThrow("Csv convert option " + value + " should be a char"); + JniThrow("Option " + key + " should be a char, but is " + value); } return value.at(0); } @@ -405,7 +405,7 @@ bool SetCsvConvertOptions(arrow::csv::ConvertOptions& options, const std::string } else if (key == "auto_dict_max_cardinality") { options.auto_dict_max_cardinality = std::stoi(value); } else if (key == "decimal_point") { - options.decimal_point = ParseChar(value); + options.decimal_point = ParseChar(key, value); } else if (key == "include_missing_columns") { options.include_missing_columns = ParseBool(value); } else { @@ -417,17 +417,17 @@ bool SetCsvConvertOptions(arrow::csv::ConvertOptions& options, const std::string bool SetCsvParseOptions(arrow::csv::ParseOptions& options, const std::string& key, const std::string& value) { if (key == "delimiter") { - options.delimiter = ParseChar(value); + options.delimiter = ParseChar(key, value); } else if (key == "quoting") { options.quoting = ParseBool(value); } else if (key == "quote_char") { - options.quote_char = ParseChar(value); + options.quote_char = ParseChar(key, value); } else if (key == "double_quote") { options.double_quote = ParseBool(value); } else if (key == "escaping") { options.escaping = ParseBool(value); } else if (key == "escape_char") { - options.escape_char = ParseChar(value); + options.escape_char = ParseChar(key, value); } else if (key == "newlines_in_values") { options.newlines_in_values = ParseBool(value); } else if (key == "ignore_empty_lines") { @@ -456,18 +456,16 @@ bool SetCsvReadOptions(arrow::csv::ReadOptions& options, const std::string& key, return true; } -arrow::Result> -ToCsvFragmentScanOptions(const std::unordered_map& configs) { +std::shared_ptr ToCsvFragmentScanOptions( + const std::unordered_map& configs) { std::shared_ptr options = std::make_shared(); - for (const auto& it : configs) { - const auto& key = it.first; - const auto& value = it.second; + for (const auto& [key, value] : configs) { bool setValid = SetCsvParseOptions(options->parse_options, key, value) || SetCsvConvertOptions(options->convert_options, key, value) || SetCsvReadOptions(options->read_options, key, value); if (!setValid) { - return arrow::Status::Invalid("Config " + key + " is not supported."); + JniThrow("Config " + key + " is not supported."); } } return options; diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java index 2fe30d596e0db..dddc36d38714e 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java @@ -32,8 +32,9 @@ public class CsvFragmentScanOptions implements FragmentScanOptions { * CSV scan options, map to CPP struct CsvFragmentScanOptions. The key in config map is the field * name of mapping cpp struct * - *

If the option type is a std::vector in the CPP code, only support for setting one value. For - * example, for convert option null_values, only support set one string as null value. + *

Currently, multi-valued options (which are std::vector values in C++) only support having a + * single value set. For example, for the null_values option, only one string can be set as the + * null value. * * @param convertOptions similar to CsvFragmentScanOptions#convert_options in CPP, the ArrowSchema * represents column_types, convert data option such as null value recognition. diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java index 11f6fd9c1ac43..d598190528811 100644 --- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java +++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java @@ -22,7 +22,6 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import com.google.common.collect.ImmutableMap; -import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.Map; @@ -46,6 +45,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.Text; import org.hamcrest.collection.IsIterableContainingInOrder; import org.junit.jupiter.api.Test; @@ -181,7 +181,7 @@ public void testCsvReadParseAndReadOptions() throws Exception { CsvFragmentScanOptions fragmentScanOptions = new CsvFragmentScanOptions( new CsvConvertOptions(ImmutableMap.of()), - ImmutableMap.of("skip_rows", "1"), + ImmutableMap.of("skip_rows_after_names", "1"), ImmutableMap.of("delimiter", ";")); ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) @@ -190,7 +190,11 @@ public void testCsvReadParseAndReadOptions() throws Exception { .build(); try (DatasetFactory datasetFactory = new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); + allocator, + NativeMemoryPool.getDefault(), + FileFormat.CSV, + path, + Optional.of(fragmentScanOptions)); Dataset dataset = datasetFactory.finish(); Scanner scanner = dataset.newScan(options); ArrowReader reader = scanner.scanBatches()) { @@ -198,12 +202,12 @@ public void testCsvReadParseAndReadOptions() throws Exception { assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields()); int rowCount = 0; while (reader.loadNextBatch()) { - final ValueIterableVector idVector = - (ValueIterableVector) - reader.getVectorSchemaRoot().getVector("Id;Name;Language"); + final ValueIterableVector idVector = + (ValueIterableVector) reader.getVectorSchemaRoot().getVector("Id;Name;Language"); assertThat( idVector.getValueIterable(), - IsIterableContainingInOrder.contains("2;Peter;Python\n" + "3;Celin;C++")); + IsIterableContainingInOrder.contains( + new Text("2;Peter;Python"), new Text("3;Celin;C++"))); rowCount += reader.getVectorSchemaRoot().getRowCount(); } assertEquals(2, rowCount); @@ -297,7 +301,7 @@ public void testCsvInvalidOption() throws Exception { new FileSystemDatasetFactory( allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); Dataset dataset = datasetFactory.finish()) { - assertThrows(IOException.class, () -> dataset.newScan(options)); + assertThrows(RuntimeException.class, () -> dataset.newScan(options)); } CsvFragmentScanOptions fragmentScanOptionsFaultValue = @@ -314,7 +318,7 @@ public void testCsvInvalidOption() throws Exception { new FileSystemDatasetFactory( allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path); Dataset dataset = datasetFactory.finish()) { - assertThrows(Throwable.class, () -> dataset.newScan(optionsFault)); + assertThrows(RuntimeException.class, () -> dataset.newScan(optionsFault)); } } }