Skip to content

Commit

Permalink
implement string_to_array (#7577)
Browse files Browse the repository at this point in the history
* implement string_to_array

* string_to_array doc and test updates

* move string_to_array from string functions to array functions
  • Loading branch information
casperhart committed Sep 18, 2023
1 parent d512bee commit 81f33b0
Show file tree
Hide file tree
Showing 12 changed files with 202 additions and 2 deletions.
17 changes: 16 additions & 1 deletion datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ pub enum BuiltinScalarFunction {
SHA512,
/// split_part
SplitPart,
/// string_to_array
StringToArray,
/// starts_with
StartsWith,
/// strpos
Expand Down Expand Up @@ -426,6 +428,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::SHA512 => Volatility::Immutable,
BuiltinScalarFunction::Digest => Volatility::Immutable,
BuiltinScalarFunction::SplitPart => Volatility::Immutable,
BuiltinScalarFunction::StringToArray => Volatility::Immutable,
BuiltinScalarFunction::StartsWith => Volatility::Immutable,
BuiltinScalarFunction::Strpos => Volatility::Immutable,
BuiltinScalarFunction::Substr => Volatility::Immutable,
Expand Down Expand Up @@ -711,6 +714,11 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::SplitPart => {
utf8_to_str_type(&input_expr_types[0], "split_part")
}
BuiltinScalarFunction::StringToArray => Ok(List(Arc::new(Field::new(
"item",
input_expr_types[0].clone(),
true,
)))),
BuiltinScalarFunction::StartsWith => Ok(Boolean),
BuiltinScalarFunction::Strpos => {
utf8_to_int_type(&input_expr_types[0], "strpos")
Expand Down Expand Up @@ -1068,7 +1076,13 @@ impl BuiltinScalarFunction {
],
self.volatility(),
),

BuiltinScalarFunction::StringToArray => Signature::one_of(
vec![
TypeSignature::Uniform(2, vec![Utf8, LargeUtf8]),
TypeSignature::Uniform(3, vec![Utf8, LargeUtf8]),
],
self.volatility(),
),
BuiltinScalarFunction::Strpos | BuiltinScalarFunction::StartsWith => {
Signature::one_of(
vec![
Expand Down Expand Up @@ -1279,6 +1293,7 @@ fn aliases(func: &BuiltinScalarFunction) -> &'static [&'static str] {
BuiltinScalarFunction::Rpad => &["rpad"],
BuiltinScalarFunction::Rtrim => &["rtrim"],
BuiltinScalarFunction::SplitPart => &["split_part"],
BuiltinScalarFunction::StringToArray => &["string_to_array", "string_to_list"],
BuiltinScalarFunction::StartsWith => &["starts_with"],
BuiltinScalarFunction::Strpos => &["strpos"],
BuiltinScalarFunction::Substr => &["substr"],
Expand Down
2 changes: 2 additions & 0 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,7 @@ scalar_expr!(SHA256, sha256, string, "SHA-256 hash");
scalar_expr!(SHA384, sha384, string, "SHA-384 hash");
scalar_expr!(SHA512, sha512, string, "SHA-512 hash");
scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string based on a delimiter and picks out the desired field based on the index.");
scalar_expr!(StringToArray, string_to_array, string delimiter null_string, "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`");
scalar_expr!(StartsWith, starts_with, string prefix, "whether the `string` starts with the `prefix`");
scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`");
scalar_expr!(Substr, substr, string position, "substring from the `position` to the end");
Expand Down Expand Up @@ -1080,6 +1081,7 @@ mod test {
test_scalar_expr!(SHA384, sha384, string);
test_scalar_expr!(SHA512, sha512, string);
test_scalar_expr!(SplitPart, split_part, expr, delimiter, index);
test_scalar_expr!(StringToArray, string_to_array, expr, delimiter, null_value);
test_scalar_expr!(StartsWith, starts_with, string, characters);
test_scalar_expr!(Strpos, strpos, string, substring);
test_scalar_expr!(Substr, substr, string, position);
Expand Down
89 changes: 89 additions & 0 deletions datafusion/physical-expr/src/array_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1864,6 +1864,95 @@ pub fn array_has_all(args: &[ArrayRef]) -> Result<ArrayRef> {
Ok(Arc::new(boolean_builder.finish()))
}

/// Splits string at occurrences of delimiter and returns an array of parts
/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]'
pub fn string_to_array<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;
let delimiter_array = as_generic_string_array::<T>(&args[1])?;

let mut list_builder = ListBuilder::new(StringBuilder::with_capacity(
string_array.len(),
string_array.get_buffer_memory_size(),
));

match args.len() {
2 => {
string_array.iter().zip(delimiter_array.iter()).for_each(
|(string, delimiter)| {
match (string, delimiter) {
(Some(string), Some("")) => {
list_builder.values().append_value(string);
list_builder.append(true);
}
(Some(string), Some(delimiter)) => {
string.split(delimiter).for_each(|s| {
list_builder.values().append_value(s);
});
list_builder.append(true);
}
(Some(string), None) => {
string.chars().map(|c| c.to_string()).for_each(|c| {
list_builder.values().append_value(c);
});
list_builder.append(true);
}
_ => list_builder.append(false), // null value
}
},
);
}

3 => {
let null_value_array = as_generic_string_array::<T>(&args[2])?;
string_array
.iter()
.zip(delimiter_array.iter())
.zip(null_value_array.iter())
.for_each(|((string, delimiter), null_value)| {
match (string, delimiter) {
(Some(string), Some("")) => {
if Some(string) == null_value {
list_builder.values().append_null();
} else {
list_builder.values().append_value(string);
}
list_builder.append(true);
}
(Some(string), Some(delimiter)) => {
string.split(delimiter).for_each(|s| {
if Some(s) == null_value {
list_builder.values().append_null();
} else {
list_builder.values().append_value(s);
}
});
list_builder.append(true);
}
(Some(string), None) => {
string.chars().map(|c| c.to_string()).for_each(|c| {
if Some(c.as_str()) == null_value {
list_builder.values().append_null();
} else {
list_builder.values().append_value(c);
}
});
list_builder.append(true);
}
_ => list_builder.append(false), // null value
}
});
}
_ => {
return internal_err!(
"Expect string_to_array function to take two or three parameters"
)
}
}

let list_array = list_builder.finish();
Ok(Arc::new(list_array) as ArrayRef)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
15 changes: 15 additions & 0 deletions datafusion/physical-expr/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,21 @@ pub fn create_physical_fun(
internal_err!("Unsupported data type {other:?} for function split_part")
}
}),
BuiltinScalarFunction::StringToArray => {
Arc::new(|args| match args[0].data_type() {
DataType::Utf8 => {
make_scalar_function(array_expressions::string_to_array::<i32>)(args)
}
DataType::LargeUtf8 => {
make_scalar_function(array_expressions::string_to_array::<i64>)(args)
}
other => {
internal_err!(
"Unsupported data type {other:?} for function string_to_array"
)
}
})
}
BuiltinScalarFunction::StartsWith => Arc::new(|args| match args[0].data_type() {
DataType::Utf8 => {
make_scalar_function(string_expressions::starts_with::<i32>)(args)
Expand Down
1 change: 1 addition & 0 deletions datafusion/proto/proto/datafusion.proto
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,7 @@ enum ScalarFunction {
Iszero = 114;
ArrayEmpty = 115;
ArrayPopBack = 116;
StringToArray = 117;
}

message ScalarFunctionNode {
Expand Down
3 changes: 3 additions & 0 deletions datafusion/proto/src/generated/pbjson.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions datafusion/proto/src/generated/prost.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/proto/src/logical_plan/from_proto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
ScalarFunction::Right => Self::Right,
ScalarFunction::Rpad => Self::Rpad,
ScalarFunction::SplitPart => Self::SplitPart,
ScalarFunction::StringToArray => Self::StringToArray,
ScalarFunction::StartsWith => Self::StartsWith,
ScalarFunction::Strpos => Self::Strpos,
ScalarFunction::Substr => Self::Substr,
Expand Down
1 change: 1 addition & 0 deletions datafusion/proto/src/logical_plan/to_proto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1518,6 +1518,7 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
BuiltinScalarFunction::Right => Self::Right,
BuiltinScalarFunction::Rpad => Self::Rpad,
BuiltinScalarFunction::SplitPart => Self::SplitPart,
BuiltinScalarFunction::StringToArray => Self::StringToArray,
BuiltinScalarFunction::StartsWith => Self::StartsWith,
BuiltinScalarFunction::Strpos => Self::Strpos,
BuiltinScalarFunction::Substr => Self::Substr,
Expand Down
46 changes: 46 additions & 0 deletions datafusion/sqllogictest/test_files/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2485,6 +2485,52 @@ NULL
false
false

query ?
SELECT string_to_array('abcxxxdef', 'xxx')
----
[abc, def]

query ?
SELECT string_to_array('abc', '')
----
[abc]

query ?
SELECT string_to_array('abc', NULL)
----
[a, b, c]

query ?
SELECT string_to_array('abc def', ' ', 'def')
----
[abc, ]

query ?
select string_to_array(e, ',') from values;
----
[Lorem]
[ipsum]
[dolor]
[sit]
[amet]
[, ]
[consectetur]
[adipiscing]
NULL

query ?
select string_to_list(e, 'm') from values;
----
[Lore, ]
[ipsu, ]
[dolor]
[sit]
[a, et]
[,]
[consectetur]
[adipiscing]
NULL

### Delete tables

statement ok
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/functions.slt
Original file line number Diff line number Diff line change
Expand Up @@ -810,4 +810,4 @@ SELECT products.* REPLACE (price*2 AS price, product_id+1000 AS product_id) FROM
1001 OldBrand Product 1 39.98
1002 OldBrand Product 2 59.98
1003 OldBrand Product 3 79.98
1004 OldBrand Product 4 99.98
1004 OldBrand Product 4 99.98
24 changes: 24 additions & 0 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -1523,6 +1523,8 @@ from_unixtime(expression)
- [list_to_string](#list_to_string)
- [make_array](#make_array)
- [make_list](#make_list)
- [string_to_array](#string_to_array)
- [string_to_list](#string_to_list)
- [trim_array](#trim_array)

### `array_append`
Expand Down Expand Up @@ -2369,6 +2371,28 @@ make_array(expression1[, ..., expression_n])

_Alias of [make_array](#make_array)._

### `string_to_array`

Splits a string in to an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL.

```
starts_with(str, delimiter[, null_str])
```

#### Arguments

- **str**: String expression to split.
- **delimiter**: Delimiter string to split on.
- **null_str**: Substring values to be replaced with `NULL`

#### Aliases

- string_to_list

### `string_to_list`

_Alias of [string_to_array](#string_to_array)._

### `trim_array`

Removes the last n elements from the array.
Expand Down

0 comments on commit 81f33b0

Please sign in to comment.