Skip to content

Commit

Permalink
address comments and fix test
Browse files Browse the repository at this point in the history
  • Loading branch information
jinchengchenghh committed Aug 2, 2024
1 parent a2d4fde commit f112912
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 23 deletions.
22 changes: 10 additions & 12 deletions java/dataset/src/main/cpp/jni_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,9 @@ std::shared_ptr<arrow::Buffer> LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec

inline bool ParseBool(const std::string& value) { return value == "true" ? true : false; }

inline bool ParseChar(const std::string& value) {
inline bool ParseChar(const std::string& key, const std::string& value) {
if (value.size() != 1) {
JniThrow("Csv convert option " + value + " should be a char");
JniThrow("Option " + key + " should be a char, but is " + value);
}
return value.at(0);
}
Expand Down Expand Up @@ -405,7 +405,7 @@ bool SetCsvConvertOptions(arrow::csv::ConvertOptions& options, const std::string
} else if (key == "auto_dict_max_cardinality") {
options.auto_dict_max_cardinality = std::stoi(value);
} else if (key == "decimal_point") {
options.decimal_point = ParseChar(value);
options.decimal_point = ParseChar(key, value);
} else if (key == "include_missing_columns") {
options.include_missing_columns = ParseBool(value);
} else {
Expand All @@ -417,17 +417,17 @@ bool SetCsvConvertOptions(arrow::csv::ConvertOptions& options, const std::string
bool SetCsvParseOptions(arrow::csv::ParseOptions& options, const std::string& key,
const std::string& value) {
if (key == "delimiter") {
options.delimiter = ParseChar(value);
options.delimiter = ParseChar(key, value);
} else if (key == "quoting") {
options.quoting = ParseBool(value);
} else if (key == "quote_char") {
options.quote_char = ParseChar(value);
options.quote_char = ParseChar(key, value);
} else if (key == "double_quote") {
options.double_quote = ParseBool(value);
} else if (key == "escaping") {
options.escaping = ParseBool(value);
} else if (key == "escape_char") {
options.escape_char = ParseChar(value);
options.escape_char = ParseChar(key, value);
} else if (key == "newlines_in_values") {
options.newlines_in_values = ParseBool(value);
} else if (key == "ignore_empty_lines") {
Expand Down Expand Up @@ -456,18 +456,16 @@ bool SetCsvReadOptions(arrow::csv::ReadOptions& options, const std::string& key,
return true;
}

arrow::Result<std::shared_ptr<arrow::dataset::FragmentScanOptions>>
ToCsvFragmentScanOptions(const std::unordered_map<std::string, std::string>& configs) {
std::shared_ptr<arrow::dataset::FragmentScanOptions> ToCsvFragmentScanOptions(
const std::unordered_map<std::string, std::string>& configs) {
std::shared_ptr<arrow::dataset::CsvFragmentScanOptions> options =
std::make_shared<arrow::dataset::CsvFragmentScanOptions>();
for (const auto& it : configs) {
const auto& key = it.first;
const auto& value = it.second;
for (const auto& [key, value] : configs) {
bool setValid = SetCsvParseOptions(options->parse_options, key, value) ||
SetCsvConvertOptions(options->convert_options, key, value) ||
SetCsvReadOptions(options->read_options, key, value);
if (!setValid) {
return arrow::Status::Invalid("Config " + key + " is not supported.");
JniThrow("Config " + key + " is not supported.");
}
}
return options;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ public class CsvFragmentScanOptions implements FragmentScanOptions {
* CSV scan options, map to CPP struct CsvFragmentScanOptions. The key in config map is the field
* name of mapping cpp struct
*
* <p>If the option type is a std::vector in the CPP code, only support for setting one value. For
* example, for convert option null_values, only support set one string as null value.
* <p>Currently, multi-valued options (which are std::vector values in C++) only support having a
* single value set. For example, for the null_values option, only one string can be set as the
* null value.
*
* @param convertOptions similar to CsvFragmentScanOptions#convert_options in CPP, the ArrowSchema
* represents column_types, convert data option such as null value recognition.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import static org.junit.jupiter.api.Assertions.assertThrows;

import com.google.common.collect.ImmutableMap;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
Expand All @@ -46,6 +45,7 @@
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.Text;
import org.hamcrest.collection.IsIterableContainingInOrder;
import org.junit.jupiter.api.Test;

Expand Down Expand Up @@ -181,7 +181,7 @@ public void testCsvReadParseAndReadOptions() throws Exception {
CsvFragmentScanOptions fragmentScanOptions =
new CsvFragmentScanOptions(
new CsvConvertOptions(ImmutableMap.of()),
ImmutableMap.of("skip_rows", "1"),
ImmutableMap.of("skip_rows_after_names", "1"),
ImmutableMap.of("delimiter", ";"));
ScanOptions options =
new ScanOptions.Builder(/*batchSize*/ 32768)
Expand All @@ -190,20 +190,24 @@ public void testCsvReadParseAndReadOptions() throws Exception {
.build();
try (DatasetFactory datasetFactory =
new FileSystemDatasetFactory(
allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path);
allocator,
NativeMemoryPool.getDefault(),
FileFormat.CSV,
path,
Optional.of(fragmentScanOptions));
Dataset dataset = datasetFactory.finish();
Scanner scanner = dataset.newScan(options);
ArrowReader reader = scanner.scanBatches()) {

assertEquals(schema.getFields(), reader.getVectorSchemaRoot().getSchema().getFields());
int rowCount = 0;
while (reader.loadNextBatch()) {
final ValueIterableVector<String> idVector =
(ValueIterableVector<String>)
reader.getVectorSchemaRoot().getVector("Id;Name;Language");
final ValueIterableVector<Text> idVector =
(ValueIterableVector<Text>) reader.getVectorSchemaRoot().getVector("Id;Name;Language");
assertThat(
idVector.getValueIterable(),
IsIterableContainingInOrder.contains("2;Peter;Python\n" + "3;Celin;C++"));
IsIterableContainingInOrder.contains(
new Text("2;Peter;Python"), new Text("3;Celin;C++")));
rowCount += reader.getVectorSchemaRoot().getRowCount();
}
assertEquals(2, rowCount);
Expand Down Expand Up @@ -297,7 +301,7 @@ public void testCsvInvalidOption() throws Exception {
new FileSystemDatasetFactory(
allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path);
Dataset dataset = datasetFactory.finish()) {
assertThrows(IOException.class, () -> dataset.newScan(options));
assertThrows(RuntimeException.class, () -> dataset.newScan(options));
}

CsvFragmentScanOptions fragmentScanOptionsFaultValue =
Expand All @@ -314,7 +318,7 @@ public void testCsvInvalidOption() throws Exception {
new FileSystemDatasetFactory(
allocator, NativeMemoryPool.getDefault(), FileFormat.CSV, path);
Dataset dataset = datasetFactory.finish()) {
assertThrows(Throwable.class, () -> dataset.newScan(optionsFault));
assertThrows(RuntimeException.class, () -> dataset.newScan(optionsFault));
}
}
}

0 comments on commit f112912

Please sign in to comment.