Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removes min/max/count comparison based on name in aggregate statistics #12296

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion datafusion/expr/src/udaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ use std::vec;

use arrow::datatypes::{DataType, Field};

use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue};
use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue, Statistics};
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;

use crate::expr::AggregateFunction;
use crate::function::{
Expand Down Expand Up @@ -262,6 +263,16 @@ impl AggregateUDF {
self.inner.is_descending()
}

/// Returns this aggregate's value computed directly from `statistics`, when the
/// wrapped implementation is able to derive it.
///
/// This simply forwards to `self.inner`; `None` means no value could be obtained
/// from the statistics.
///
/// See [`AggregateUDFImpl::value_from_stats`] for more details.
pub fn value_from_stats(
    &self,
    statistics: &Statistics,
    data_type: &DataType,
    arguments: &[Arc<dyn PhysicalExpr>],
) -> Option<ScalarValue> {
    // `data_type` is already a `&DataType`; forward it as-is instead of re-borrowing.
    self.inner.value_from_stats(statistics, data_type, arguments)
}

/// See [`AggregateUDFImpl::default_value`] for more details.
pub fn default_value(&self, data_type: &DataType) -> Result<ScalarValue> {
self.inner.default_value(data_type)
Expand Down Expand Up @@ -574,6 +585,15 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
fn is_descending(&self) -> Option<bool> {
// Default implementation: report `None`. Implementors may override this
// (for example, `Max` in `min_max.rs` returns `Some(true)`).
None
}
/// Returns the value of this aggregate computed directly from `statistics`,
/// or `None` when it cannot be derived that way.
///
/// The default implementation ignores all of its arguments and returns `None`;
/// implementors override it to answer from statistics.
fn value_from_stats(
&self,
_statistics: &Statistics,
_data_type: &DataType,
_arguments: &[Arc<dyn PhysicalExpr>],
) -> Option<ScalarValue> {
None
}

/// Returns default value of the function given the input is all `null`.
///
Expand Down
32 changes: 32 additions & 0 deletions datafusion/functions-aggregate/src/count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
// under the License.

use ahash::RandomState;
use datafusion_common::stats::Precision;
use datafusion_functions_aggregate_common::aggregate::count_distinct::BytesViewDistinctCountAccumulator;
use datafusion_physical_expr::expressions;
use std::collections::HashSet;
use std::ops::BitAnd;
use std::{fmt::Debug, sync::Arc};
Expand Down Expand Up @@ -54,6 +56,7 @@ use datafusion_functions_aggregate_common::aggregate::count_distinct::{
use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::accumulate_indices;
use datafusion_physical_expr_common::binary_map::OutputType;

use datafusion_common::utils::expr::COUNT_STAR_EXPANSION;
make_udaf_expr_and_func!(
Count,
count,
Expand Down Expand Up @@ -291,6 +294,35 @@ impl AggregateUDFImpl for Count {
/// The default for `Count` is a non-null `Int64` zero; the requested
/// `_data_type` is not consulted.
fn default_value(&self, _data_type: &DataType) -> Result<ScalarValue> {
    let zero_rows = ScalarValue::Int64(Some(0));
    Ok(zero_rows)
}

/// Computes the count directly from `statistics` when they are precise enough.
///
/// A value is produced only when the row count is exact and there is exactly
/// one argument that is either:
/// * an [`expressions::Column`] whose null count is exact, giving
///   `num_rows - null_count`, or
/// * an [`expressions::Literal`] equal to `COUNT_STAR_EXPANSION`, giving
///   `num_rows`.
///
/// Returns `None` in every other case, including when the column index has no
/// entry in `column_statistics`, when the reported null count exceeds the row
/// count, or when the result does not fit in an `i64`.
///
/// NOTE(review): this method has no way to see whether the aggregate is
/// `DISTINCT`; presumably the caller filters that case out — confirm.
fn value_from_stats(
    &self,
    statistics: &datafusion_common::Statistics,
    _data_type: &DataType,
    arguments: &[Arc<dyn datafusion_physical_expr::PhysicalExpr>],
) -> Option<ScalarValue> {
    let num_rows = match statistics.num_rows {
        Precision::Exact(num_rows) => num_rows,
        _ => return None,
    };
    let argument = match arguments {
        [argument] => argument.as_any(),
        _ => return None,
    };

    // TODO optimize with exprs other than Column
    let count = if let Some(col_expr) = argument.downcast_ref::<expressions::Column>() {
        // Checked lookup: statistics that do not cover this column index must
        // not panic, they just mean "cannot answer from statistics".
        let column_stats = statistics.column_statistics.get(col_expr.index())?;
        match column_stats.null_count {
            // Inconsistent statistics (more nulls than rows) yield `None`
            // instead of an arithmetic underflow.
            Precision::Exact(null_count) => num_rows.checked_sub(null_count)?,
            _ => return None,
        }
    } else if let Some(lit_expr) = argument.downcast_ref::<expressions::Literal>() {
        if lit_expr.value() != &COUNT_STAR_EXPANSION {
            return None;
        }
        num_rows
    } else {
        return None;
    };

    // Refuse to answer rather than silently wrap if the count overflows `i64`.
    i64::try_from(count)
        .ok()
        .map(|count| ScalarValue::Int64(Some(count)))
}
}

#[derive(Debug)]
Expand Down
86 changes: 85 additions & 1 deletion datafusion/functions-aggregate/src/min_max.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,15 @@ use arrow::datatypes::{
UInt8Type,
};
use arrow_schema::IntervalUnit;
use datafusion_common::stats::Precision;
use datafusion_common::{
downcast_value, exec_err, internal_err, DataFusionError, Result,
downcast_value, exec_err, internal_err, ColumnStatistics, DataFusionError, Result,
Statistics,
};
use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator;
use datafusion_physical_expr::{expressions, PhysicalExpr};
use std::fmt::Debug;
use std::sync::Arc;

use arrow::datatypes::i256;
use arrow::datatypes::{
Expand Down Expand Up @@ -147,6 +151,55 @@ macro_rules! instantiate_min_accumulator {
}};
}

/// Helper for aggregates whose result can be read straight from a column's
/// statistics.
///
/// Implementors supply [`Self::value_from_column_statistics`]; the provided
/// [`Self::value_from_statistics`] performs the checks they all share.
trait FromColumnStatistics {
    /// Extracts the aggregate's value from a single column's statistics, or
    /// returns `None` when those statistics cannot provide it.
    fn value_from_column_statistics(
        &self,
        stats: &ColumnStatistics,
    ) -> Option<ScalarValue>;

    /// Resolves the aggregate's value from table-level `statistics`.
    ///
    /// * Returns `None` unless the row count is exact.
    /// * With exactly zero rows, returns `ScalarValue::try_from(data_type).ok()`
    ///   (presumably a null of `data_type` — confirm against `ScalarValue`).
    /// * Otherwise, if the single argument is an [`expressions::Column`] that
    ///   has an entry in `column_statistics`, defers to
    ///   [`Self::value_from_column_statistics`].
    ///
    /// Any other shape of `arguments`, or a column index without statistics,
    /// yields `None`.
    fn value_from_statistics(
        &self,
        statistics: &Statistics,
        data_type: &DataType,
        arguments: &[Arc<dyn PhysicalExpr>],
    ) -> Option<ScalarValue> {
        let num_rows = match statistics.num_rows {
            Precision::Exact(num_rows) => num_rows,
            _ => return None,
        };
        if num_rows == 0 {
            return ScalarValue::try_from(data_type).ok();
        }

        // TODO optimize with exprs other than Column
        let argument = match arguments {
            [argument] => argument,
            _ => return None,
        };
        let col_expr = argument.as_any().downcast_ref::<expressions::Column>()?;
        // Checked lookup so statistics that do not cover this column index
        // produce `None` instead of a panic.
        let col_stats = statistics.column_statistics.get(col_expr.index())?;
        self.value_from_column_statistics(col_stats)
    }
}

impl FromColumnStatistics for Max {
    /// Yields a clone of the column's `max_value` when it is exact and
    /// non-null; otherwise `None`.
    fn value_from_column_statistics(
        &self,
        col_stats: &ColumnStatistics,
    ) -> Option<ScalarValue> {
        match &col_stats.max_value {
            Precision::Exact(maximum) if !maximum.is_null() => Some(maximum.clone()),
            _ => None,
        }
    }
}

impl AggregateUDFImpl for Max {
fn as_any(&self) -> &dyn std::any::Any {
self
Expand Down Expand Up @@ -272,6 +325,7 @@ impl AggregateUDFImpl for Max {
fn is_descending(&self) -> Option<bool> {
// `Max` overrides the trait default and reports `Some(true)`.
Some(true)
}

/// `Max` declares itself insensitive to the ordering of its input.
fn order_sensitivity(&self) -> datafusion_expr::utils::AggregateOrderSensitivity {
    use datafusion_expr::utils::AggregateOrderSensitivity;

    AggregateOrderSensitivity::Insensitive
}
Expand All @@ -282,6 +336,14 @@ impl AggregateUDFImpl for Max {
fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
datafusion_expr::ReversedUDAF::Identical
}
fn value_from_stats(
&self,
statistics: &Statistics,
data_type: &DataType,
arguments: &[Arc<dyn PhysicalExpr>],
) -> Option<ScalarValue> {
self.value_from_statistics(statistics, data_type, arguments)
}
}

// Statically-typed version of min/max(array) -> ScalarValue for string types
Expand Down Expand Up @@ -926,6 +988,20 @@ impl Default for Min {
}
}

impl FromColumnStatistics for Min {
    /// Yields a clone of the column's `min_value` when it is exact and
    /// non-null; otherwise `None`.
    fn value_from_column_statistics(
        &self,
        col_stats: &ColumnStatistics,
    ) -> Option<ScalarValue> {
        match &col_stats.min_value {
            Precision::Exact(minimum) if !minimum.is_null() => Some(minimum.clone()),
            _ => None,
        }
    }
}

impl AggregateUDFImpl for Min {
fn as_any(&self) -> &dyn std::any::Any {
self
Expand Down Expand Up @@ -1052,6 +1128,14 @@ impl AggregateUDFImpl for Min {
Some(false)
}

fn value_from_stats(
&self,
statistics: &Statistics,
data_type: &DataType,
arguments: &[Arc<dyn PhysicalExpr>],
) -> Option<ScalarValue> {
self.value_from_statistics(statistics, data_type, arguments)
}
/// `Min` declares itself insensitive to the ordering of its input.
fn order_sensitivity(&self) -> datafusion_expr::utils::AggregateOrderSensitivity {
    use datafusion_expr::utils::AggregateOrderSensitivity;

    AggregateOrderSensitivity::Insensitive
}
Expand Down
Loading
Loading