Skip to content

Commit

Permalink
feat: join selectivity (#145)
Browse files Browse the repository at this point in the history
**Summary**: Implemented join selectivity formulas for inner joins,
left/right outer joins, and cross joins. Also properly accounts for
filters in the join condition.

**Demo**:
We now match Postgres on our median Q-error. See #127 for more details
on which queries this PR affected.
![Screenshot 2024-03-31 at 13 13 48](https://github.com/cmu-db/optd/assets/20631215/fae590a6-8c55-4016-b924-c697a1c25070)

**Details**:
* We only consider equality checks between columns of different tables
to be "join ON conditions".
* Join selectivity formulas are from [Rogov
2022](https://postgrespro.com/blog/pgsql/5969618).
* If there are multiple ON conditions, we multiply their selectivities
together.
  • Loading branch information
wangpatrick57 committed Mar 31, 2024
1 parent ee080d8 commit 68666c4
Show file tree
Hide file tree
Showing 7 changed files with 1,181 additions and 191 deletions.
4 changes: 4 additions & 0 deletions optd-core/src/cascades/optimizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,10 @@ impl<T: RelNodeTyp> CascadesOptimizer<T> {
self.memo.merge_group(group_a, group_b);
}

/// Get the properties of a Cascades group.
/// `P` is the type of the property you expect.
/// `idx` is the index of the property you want. The order of properties is defined
/// by the `property_builders` parameter in `CascadesOptimizer::new()`.
pub fn get_property_by_group<P: PropertyBuilder<T>>(
&self,
group_id: GroupId,
Expand Down
1,331 changes: 1,158 additions & 173 deletions optd-datafusion-repr/src/cost/base_cost.rs

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions optd-datafusion-repr/src/plan_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ pub use sort::{LogicalSort, PhysicalSort};

use crate::properties::schema::{Schema, SchemaPropertyBuilder};

/// OptRelNodeTyp FAQ:
/// - The define_plan_node!() macro defines what the children of each join node are
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum OptRelNodeTyp {
Placeholder(GroupId),
Expand Down
4 changes: 2 additions & 2 deletions optd-perftest/src/cardtest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,14 @@ pub trait CardtestRunnerDBMSHelper {

pub async fn cardtest<P: AsRef<Path>>(
workspace_dpath: P,
no_cached_optd_stats: bool,
rebuild_cached_optd_stats: bool,
pguser: &str,
pgpassword: &str,
tpch_config: TpchConfig,
) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
let pg_dbms = Box::new(PostgresDBMS::build(&workspace_dpath, pguser, pgpassword)?);
let truecard_getter = pg_dbms.clone();
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, no_cached_optd_stats).await?);
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, rebuild_cached_optd_stats).await?);
let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![pg_dbms, df_dbms];

let tpch_benchmark = Benchmark::Tpch(tpch_config.clone());
Expand Down
21 changes: 10 additions & 11 deletions optd-perftest/src/datafusion_dbms.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ use regex::Regex;

pub struct DatafusionDBMS {
workspace_dpath: PathBuf,
no_cached_stats: bool,
rebuild_cached_stats: bool,
ctx: SessionContext,
}

Expand Down Expand Up @@ -63,11 +63,11 @@ impl CardtestRunnerDBMSHelper for DatafusionDBMS {
impl DatafusionDBMS {
pub async fn new<P: AsRef<Path>>(
workspace_dpath: P,
no_cached_stats: bool,
rebuild_cached_stats: bool,
) -> anyhow::Result<Self> {
Ok(DatafusionDBMS {
workspace_dpath: workspace_dpath.as_ref().to_path_buf(),
no_cached_stats,
rebuild_cached_stats,
ctx: Self::new_session_ctx(None).await?,
})
}
Expand Down Expand Up @@ -145,13 +145,13 @@ impl DatafusionDBMS {

let mut estcards = vec![];
for (query_id, sql_fpath) in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
let sql = fs::read_to_string(sql_fpath)?;
let estcard = self.eval_query_estcard(&sql).await?;
estcards.push(estcard);
println!(
"done evaluating datafusion's estcard for TPC-H Q{}",
"about to evaluate datafusion's estcard for TPC-H Q{}",
query_id
);
let sql = fs::read_to_string(sql_fpath)?;
let estcard = self.eval_query_estcard(&sql).await?;
estcards.push(estcard);
}

Ok(estcards)
Expand Down Expand Up @@ -213,7 +213,7 @@ impl DatafusionDBMS {
.workspace_dpath
.join("datafusion_stats_caches")
.join(format!("{}.json", benchmark_fname));
if !self.no_cached_stats && stats_cache_fpath.exists() {
if !self.rebuild_cached_stats && stats_cache_fpath.exists() {
let file = File::open(&stats_cache_fpath)?;
Ok(serde_json::from_reader(file)?)
} else {
Expand All @@ -222,9 +222,8 @@ impl DatafusionDBMS {
_ => unimplemented!(),
};

// regardless of whether self.no_cached_stats is true or false, we want to update the cache
// this way, even if we choose not to read from the cache, the cache still always has the
// most up to date version of the stats
// When self.rebuild_cached_stats is true, we *don't read* from the cache but we still
// *do write* to the cache.
fs::create_dir_all(stats_cache_fpath.parent().unwrap())?;
let file = File::create(&stats_cache_fpath)?;
serde_json::to_writer(file, &base_table_stats)?;
Expand Down
8 changes: 4 additions & 4 deletions optd-perftest/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ enum Commands {
#[clap(long)]
#[clap(action)]
#[clap(help = "Whether to use the cached optd stats/cache generated stats")]
// this is an option because you want to make it false whenever you update the
// This is an option because you want to make it true whenever you update the
// code for how stats are generated in optd, in order to not use cached stats.
// I found that I almost always want to use the cache though, which is why the
// system will use the cache by default.
no_cached_optd_stats: bool,
rebuild_cached_optd_stats: bool,

#[clap(long)]
#[clap(default_value = "default_user")]
Expand Down Expand Up @@ -77,7 +77,7 @@ async fn main() -> anyhow::Result<()> {
scale_factor,
seed,
query_ids,
no_cached_optd_stats,
rebuild_cached_optd_stats,
pguser,
pgpassword,
} => {
Expand All @@ -89,7 +89,7 @@ async fn main() -> anyhow::Result<()> {
};
let cardinfo_alldbs = cardtest::cardtest(
&workspace_dpath,
no_cached_optd_stats,
rebuild_cached_optd_stats,
&pguser,
&pgpassword,
tpch_config,
Expand Down
2 changes: 1 addition & 1 deletion optd-perftest/tests/cardtest_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ mod tests {
// make sure scale factor is low so the test runs fast
"--scale-factor",
"0.01",
"--no-cached-optd-stats",
"--rebuild-cached-optd-stats",
"--pguser",
"test_user",
"--pgpassword",
Expand Down

0 comments on commit 68666c4

Please sign in to comment.