Skip to content

Commit

Permalink
perf(expression-construction): speed up .describe() expression construction
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Jul 24, 2024
1 parent afc6b8b commit 0e97036
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 30 deletions.
2 changes: 1 addition & 1 deletion ibis/expr/operations/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ class Aggregate(Relation):

parent: Relation
groups: FrozenOrderedDict[str, Unaliased[Value]]
metrics: FrozenOrderedDict[str, Unaliased[Scalar]]
metrics: FrozenOrderedDict[str, Unaliased[Value]]

def __init__(self, parent, groups, metrics):
_check_integrity(groups.values(), {parent})
Expand Down
66 changes: 37 additions & 29 deletions ibis/expr/types/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
import ibis.expr.schema as sch
from ibis import null, util
from ibis import util
from ibis.common.deferred import Deferred, Resolver
from ibis.common.selectors import Selector
from ibis.expr.rewrites import DerefMap
Expand Down Expand Up @@ -2953,7 +2953,7 @@ def info(self) -> Table:
aggs["null_frac"].append(isna.mean())
aggs["pos"].append(lit(pos, type=dt.int16))

return self.select(
return self.agg(
**dict(
zip(aggs.keys(), map(ArrayValue.unnest, map(ibis.array, aggs.values())))
)
Expand Down Expand Up @@ -3028,33 +3028,37 @@ def describe(
import ibis.selectors as s
from ibis import literal as lit
from ibis import null
from ibis.expr.types.arrays import ArrayValue

quantile = sorted(quantile)
aggs = []
aggs = defaultdict(list)

string_col = False
numeric_col = False

quantile_keys = tuple(
f"p{100 * q:.6f}".rstrip("0").rstrip(".") for q in quantile
)
default_quantiles = dict.fromkeys(quantile_keys, null(dt.float64))

for pos, colname in enumerate(self.columns):
col = self[colname]
typ = col.type()

# default statistics to None
col_mean = col_std = col_min = col_max = null(dt.float64)
col_mode = null(dt.string)
quantile_values = {
f"p{100 * q:.6f}".rstrip("0").rstrip("."): null(dt.float64)
for q in quantile
}
quantile_values = default_quantiles.copy()

if typ.is_numeric():
numeric_col = True
col_mean = col.mean()
col_std = col.std()
col_min = col.min().cast(dt.float64)
col_max = col.max().cast(dt.float64)
quantile_values = {
f"p{100 * q:.6f}".rstrip("0").rstrip("."): col.quantile(q)
for q in quantile
}
col_min = col.min()
col_max = col.max()
for key, q in zip(quantile_keys, quantile):
quantile_values[key] = col.quantile(q)

elif typ.is_string():
string_col = True
col_mode = col.mode()
Expand All @@ -3065,23 +3069,27 @@ def describe(
# Will not calculate statistics for other types
continue

agg = self.agg(
name=lit(colname),
pos=lit(pos, type=dt.int16),
type=lit(str(typ)),
count=col.isnull().count(),
nulls=col.isnull().sum(),
unique=col.nunique(),
mode=col_mode,
mean=col_mean,
std=col_std,
min=col_min,
**quantile_values,
max=col_max,
)
aggs.append(agg)
aggs["name"].append(colname)
aggs["pos"].append(lit(pos, type=dt.int16))
aggs["type"].append(lit(str(typ)))
aggs["count"].append(col.count())
aggs["nulls"].append(col.isnull().sum())
aggs["unique"].append(col.nunique())
aggs["mode"].append(col_mode)
aggs["mean"].append(col_mean)
aggs["std"].append(col_std)
aggs["min"].append(col_min)

for q, val in quantile_values.items():
aggs[q].append(val)

t = ibis.union(*aggs)
aggs["max"].append(col_max)

t = self.agg(
dict(
zip(aggs.keys(), map(ArrayValue.unnest, map(ibis.array, aggs.values())))
)
)

# TODO(jiting): Need a better way to remove columns with all NULL
if string_col and not numeric_col:
Expand Down

0 comments on commit 0e97036

Please sign in to comment.