Skip to content

Commit

Permalink
Add documentation about type signatures, and export `TIMEZONE_WILDCAR…
Browse files Browse the repository at this point in the history
…D` (#7726)

* Add documentation and export `TIMEZONE_WILDCARD`

* improve example

* Apply suggestions from code review

Co-authored-by: Yongting You <[email protected]>

---------

Co-authored-by: Yongting You <[email protected]>
  • Loading branch information
alamb and 2010YOUY01 committed Oct 7, 2023
1 parent c0409a7 commit 0809f45
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 62 deletions.
40 changes: 16 additions & 24 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
//! Built-in functions module contains all the built-in functions definitions.

use crate::nullif::SUPPORTED_NULLIF_TYPES;
use crate::type_coercion::functions::{data_types, TIMEZONE_PLACEHOLDER};
use crate::signature::TIMEZONE_WILDCARD;
use crate::type_coercion::functions::data_types;
use crate::{
conditional_expressions, struct_expressions, utils, Signature, TypeSignature,
Volatility,
Expand Down Expand Up @@ -1029,22 +1030,22 @@ impl BuiltinScalarFunction {
Exact(vec![Utf8, Timestamp(Nanosecond, None)]),
Exact(vec![
Utf8,
Timestamp(Nanosecond, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![Utf8, Timestamp(Microsecond, None)]),
Exact(vec![
Utf8,
Timestamp(Microsecond, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![Utf8, Timestamp(Millisecond, None)]),
Exact(vec![
Utf8,
Timestamp(Millisecond, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![Utf8, Timestamp(Second, None)]),
Exact(vec![
Utf8,
Timestamp(Second, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(Second, Some(TIMEZONE_WILDCARD.into())),
]),
],
self.volatility(),
Expand All @@ -1059,11 +1060,8 @@ impl BuiltinScalarFunction {
]),
Exact(vec![
Interval(MonthDayNano),
Timestamp(
array_type.clone(),
Some(TIMEZONE_PLACEHOLDER.into()),
),
Timestamp(Nanosecond, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())),
Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![
Interval(DayTime),
Expand All @@ -1072,30 +1070,24 @@ impl BuiltinScalarFunction {
]),
Exact(vec![
Interval(DayTime),
Timestamp(
array_type.clone(),
Some(TIMEZONE_PLACEHOLDER.into()),
),
Timestamp(Nanosecond, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())),
Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![
Interval(MonthDayNano),
Timestamp(array_type.clone(), None),
]),
Exact(vec![
Interval(MonthDayNano),
Timestamp(
array_type.clone(),
Some(TIMEZONE_PLACEHOLDER.into()),
),
Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![
Interval(DayTime),
Timestamp(array_type.clone(), None),
]),
Exact(vec![
Interval(DayTime),
Timestamp(array_type, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(array_type, Some(TIMEZONE_WILDCARD.into())),
]),
]
};
Expand All @@ -1115,22 +1107,22 @@ impl BuiltinScalarFunction {
Exact(vec![Utf8, Timestamp(Second, None)]),
Exact(vec![
Utf8,
Timestamp(Second, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(Second, Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![Utf8, Timestamp(Microsecond, None)]),
Exact(vec![
Utf8,
Timestamp(Microsecond, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![Utf8, Timestamp(Millisecond, None)]),
Exact(vec![
Utf8,
Timestamp(Millisecond, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())),
]),
Exact(vec![Utf8, Timestamp(Nanosecond, None)]),
Exact(vec![
Utf8,
Timestamp(Nanosecond, Some(TIMEZONE_PLACEHOLDER.into())),
Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())),
]),
],
self.volatility(),
Expand Down
2 changes: 1 addition & 1 deletion datafusion/expr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ pub use logical_plan::*;
pub use nullif::SUPPORTED_NULLIF_TYPES;
pub use operator::Operator;
pub use partition_evaluator::PartitionEvaluator;
pub use signature::{Signature, TypeSignature, Volatility};
pub use signature::{Signature, TypeSignature, Volatility, TIMEZONE_WILDCARD};
pub use table_source::{TableProviderFilterPushDown, TableSource, TableType};
pub use udaf::AggregateUDF;
pub use udf::ScalarUDF;
Expand Down
102 changes: 76 additions & 26 deletions datafusion/expr/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,35 +20,82 @@

use arrow::datatypes::DataType;

/// Constant that is used as a placeholder for any valid timezone.
///
/// This is used where a function can accept a timestamp type with any
/// valid timezone; it exists to avoid the need to enumerate all possible
/// timezones. See [`TypeSignature`] for more details.
///
/// Type coercion always ensures that functions will be executed using
/// timestamp arrays that have a valid time zone. Functions must never
/// return results with this timezone.
pub const TIMEZONE_WILDCARD: &str = "+TZ";

///A function's volatility, which defines the functions eligibility for certain optimizations
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum Volatility {
/// Immutable - An immutable function will always return the same output when given the same
/// input. An example of this is [super::BuiltinScalarFunction::Cos].
/// An immutable function will always return the same output when given the same
/// input. An example of this is [super::BuiltinScalarFunction::Cos]. DataFusion
/// will attempt to inline immutable functions during planning.
Immutable,
/// Stable - A stable function may return different values given the same input across different
/// A stable function may return different values given the same input across different
/// queries but must return the same value for a given input within a query. An example of
/// this is [super::BuiltinScalarFunction::Now].
/// this is [super::BuiltinScalarFunction::Now]. DataFusion
/// will attempt to inline `Stable` functions during planning, when possible.
/// For the query `select col1, now() from t1`, execution might take a while, but the
/// `now()` column will be the same for each output row because it is evaluated
/// during planning.
Stable,
/// Volatile - A volatile function may change the return value from evaluation to evaluation.
/// A volatile function may change the return value from evaluation to evaluation.
/// Multiple invocations of a volatile function may return different results when used in the
/// same query. An example of this is [super::BuiltinScalarFunction::Random].
/// same query. An example of this is [super::BuiltinScalarFunction::Random]. DataFusion
/// cannot evaluate such functions during planning.
/// In the query `select col1, random() from t1`, `random()` function will be evaluated
/// for each output row, resulting in a unique random value for each row.
Volatile,
}

/// A function's type signature, which defines the function's supported argument types.
/// A function's type signature defines the types of arguments the function supports.
///
/// Functions typically support only a few different types of arguments compared to the
/// different datatypes in Arrow. To make functions easy to use, when possible DataFusion
/// automatically coerces (adds casts to) function arguments so they match the type signature.
///
/// For example, a function like `cos` may only be implemented for `Float64` arguments. To support a query
/// that calls `cos` with a different argument type, such as `cos(int_column)`, type coercion automatically
/// adds a cast such as `cos(CAST(int_column AS DOUBLE))` during planning.
///
/// # Data Types
/// Types to match are represented using Arrow's [`DataType`]. [`DataType::Timestamp`] has an optional variable
/// timezone specification. To specify a function can handle a timestamp with *ANY* timezone, use
/// the [`TIMEZONE_WILDCARD`]. For example:
///
/// ```
/// # use arrow::datatypes::{DataType, TimeUnit};
/// # use datafusion_expr::{TIMEZONE_WILDCARD, TypeSignature};
/// let type_signature = TypeSignature::Exact(vec![
/// // A nanosecond precision timestamp with ANY timezone
/// // matches Timestamp(Nanosecond, Some("+0:00"))
/// // matches Timestamp(Nanosecond, Some("+5:00"))
/// // does not match Timestamp(Nanosecond, None)
/// DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())),
/// ]);
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TypeSignature {
/// arbitrary number of arguments of an common type out of a list of valid types
// A function such as `concat` is `Variadic(vec![DataType::Utf8, DataType::LargeUtf8])`
/// arbitrary number of arguments of a common type out of a list of valid types.
///
/// # Examples
/// A function such as `concat` is `Variadic(vec![DataType::Utf8, DataType::LargeUtf8])`
Variadic(Vec<DataType>),
/// arbitrary number of arguments of an arbitrary but equal type
// A function such as `array` is `VariadicEqual`
// The first argument decides the type used for coercion
/// arbitrary number of arguments of an arbitrary but equal type.
/// DataFusion attempts to coerce all argument types to match the first argument's type
///
/// # Examples
/// A function such as `array` is `VariadicEqual`
VariadicEqual,
/// arbitrary number of arguments with arbitrary types
VariadicAny,
/// fixed number of arguments of an arbitrary but equal type out of a list of valid types
/// fixed number of arguments of an arbitrary but equal type out of a list of valid types.
///
/// # Examples
/// 1. A function of one argument of f64 is `Uniform(1, vec![DataType::Float64])`
Expand All @@ -58,7 +105,8 @@ pub enum TypeSignature {
Exact(Vec<DataType>),
/// fixed number of arguments of arbitrary types
Any(usize),
/// One of a list of signatures
/// Matches exactly one of a list of [`TypeSignature`]s. Coercion is attempted to match
/// the signatures in order, and stops after the first success, if any.
OneOf(Vec<TypeSignature>),
}

Expand Down Expand Up @@ -104,46 +152,48 @@ impl TypeSignature {
}
}

/// The signature of a function defines the supported argument types
/// and its volatility.
/// Defines the supported argument types ([`TypeSignature`]) and [`Volatility`] for a function.
///
/// DataFusion will automatically coerce (cast) argument types to one of the supported
/// function signatures, if possible.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Signature {
/// type_signature - The types that the function accepts. See [TypeSignature] for more information.
/// The data types that the function accepts. See [TypeSignature] for more information.
pub type_signature: TypeSignature,
/// volatility - The volatility of the function. See [Volatility] for more information.
/// The volatility of the function. See [Volatility] for more information.
pub volatility: Volatility,
}

impl Signature {
/// new - Creates a new Signature from any type signature and the volatility.
/// Creates a new Signature from a given type signature and volatility.
pub fn new(type_signature: TypeSignature, volatility: Volatility) -> Self {
Signature {
type_signature,
volatility,
}
}
/// variadic - Creates a variadic signature that represents an arbitrary number of arguments all from a type in common_types.
/// An arbitrary number of arguments with the same type, from those listed in `common_types`.
pub fn variadic(common_types: Vec<DataType>, volatility: Volatility) -> Self {
Self {
type_signature: TypeSignature::Variadic(common_types),
volatility,
}
}
/// variadic_equal - Creates a variadic signature that represents an arbitrary number of arguments of the same type.
/// An arbitrary number of arguments of the same type.
pub fn variadic_equal(volatility: Volatility) -> Self {
Self {
type_signature: TypeSignature::VariadicEqual,
volatility,
}
}
/// variadic_any - Creates a variadic signature that represents an arbitrary number of arguments of any type.
/// An arbitrary number of arguments of any type.
pub fn variadic_any(volatility: Volatility) -> Self {
Self {
type_signature: TypeSignature::VariadicAny,
volatility,
}
}
/// uniform - Creates a function with a fixed number of arguments of the same type, which must be from valid_types.
/// A fixed number of arguments of the same type, from those listed in `valid_types`.
pub fn uniform(
arg_count: usize,
valid_types: Vec<DataType>,
Expand All @@ -154,21 +204,21 @@ impl Signature {
volatility,
}
}
/// exact - Creates a signature which must match the types in exact_types in order.
/// Exactly matches the types in `exact_types`, in order.
pub fn exact(exact_types: Vec<DataType>, volatility: Volatility) -> Self {
Signature {
type_signature: TypeSignature::Exact(exact_types),
volatility,
}
}
/// any - Creates a signature which can a be made of any type but of a specified number
/// A specified number of arguments of any type
pub fn any(arg_count: usize, volatility: Volatility) -> Self {
Signature {
type_signature: TypeSignature::Any(arg_count),
volatility,
}
}
/// one_of Creates a signature which can match any of the [TypeSignature]s which are passed in.
/// Any one of a list of [TypeSignature]s.
pub fn one_of(type_signatures: Vec<TypeSignature>, volatility: Volatility) -> Self {
Signature {
type_signature: TypeSignature::OneOf(type_signatures),
Expand Down
13 changes: 2 additions & 11 deletions datafusion/expr/src/type_coercion/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use crate::signature::TIMEZONE_WILDCARD;
use crate::{Signature, TypeSignature};
use arrow::{
compute::can_cast_types,
datatypes::{DataType, TimeUnit},
};
use datafusion_common::{plan_err, DataFusionError, Result};

/// Constant that is used as a placeholder for any valid timezone.
/// This is used where a function can accept a timestamp type with any
/// valid timezone, it exists to avoid the need to enumerate all possible
/// timezones.
///
/// Type coercion always ensures that functions will be executed using
/// timestamp arrays that have a valid time zone. Functions must never
/// return results with this timezone.
pub(crate) const TIMEZONE_PLACEHOLDER: &str = "+TZ";

/// Performs type coercion for function arguments.
///
/// Returns the data types to which each argument must be coerced to
Expand Down Expand Up @@ -232,7 +223,7 @@ fn coerced_from<'a>(
Utf8 | LargeUtf8 => Some(type_into.clone()),
Null if can_cast_types(type_from, type_into) => Some(type_into.clone()),

Timestamp(unit, Some(tz)) if tz.as_ref() == TIMEZONE_PLACEHOLDER => {
Timestamp(unit, Some(tz)) if tz.as_ref() == TIMEZONE_WILDCARD => {
match type_from {
Timestamp(_, Some(from_tz)) => {
Some(Timestamp(unit.clone(), Some(from_tz.clone())))
Expand Down

0 comments on commit 0809f45

Please sign in to comment.