diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f520db66eced..7f892ac3dd28 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -130,15 +130,15 @@ jobs: cp ./target/${{ matrix.target }}/${{ env.BUILD_PROFILE }}/databend-* ./target/${{ env.BUILD_PROFILE }}/ bash ./scripts/ci/ci-run-sqllogic-tests.sh base - build_udf: - runs-on: [self-hosted, "${{ matrix.runner }}", Linux, 16c32g, aws] + build_musl: + runs-on: [self-hosted, X64, Linux, 16c32g, aws] needs: create_release strategy: fail-fast: false matrix: - include: - - { target: x86_64-unknown-linux-gnu, runner: X64 } - - { target: aarch64-unknown-linux-gnu, runner: ARM64 } + target: + - x86_64-unknown-linux-musl + - aarch64-unknown-linux-musl steps: - name: Checkout uses: actions/checkout@v4 @@ -152,11 +152,9 @@ jobs: with: sha: ${{ github.sha }} target: ${{ matrix.target }} - artifacts: sqllogictests,sqlsmith,metactl,meta,query - features: python-udf - category: udf + artifacts: query,meta,metactl - build_hdfs: + build_udf: runs-on: [self-hosted, "${{ matrix.runner }}", Linux, 16c32g, aws] needs: create_release strategy: @@ -179,28 +177,24 @@ jobs: sha: ${{ github.sha }} target: ${{ matrix.target }} artifacts: sqllogictests,sqlsmith,metactl,meta,query - features: storage-hdfs - category: hdfs + features: python-udf + category: udf publish: - runs-on: [self-hosted, "${{ matrix.runner }}", Linux, 4c8g, aws] - needs: [create_release, build_default, build_hdfs] + runs-on: [self-hosted, X64, Linux, 4c8g, aws] + needs: [create_release, build_default, build_musl] strategy: fail-fast: false matrix: include: - category: default target: x86_64-unknown-linux-gnu - runner: X64 - category: default target: aarch64-unknown-linux-gnu - runner: ARM64 - - category: hdfs - target: x86_64-unknown-linux-gnu - runner: X64 - - category: hdfs - target: aarch64-unknown-linux-gnu - runner: ARM64 + - category: default + target: x86_64-unknown-linux-musl + - category: default + target: aarch64-unknown-linux-musl steps: - name: Checkout uses: actions/checkout@v4 diff --git a/Cargo.lock b/Cargo.lock index 2cae8952e576..2388d322e6ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3756,6 +3756,7 @@ dependencies = [ "chrono-tz 0.8.6", "cron", "databend-common-ast", + "databend-common-base", "databend-common-building", "databend-common-exception", "databend-common-expression", @@ -4327,6 +4328,7 @@ dependencies = [ "ahash 0.8.11", "arrow", "arrow-array", + "arrow-ipc", "async-backtrace", "async-channel 1.9.0", "async-trait", @@ -4378,6 +4380,7 @@ dependencies = [ "siphasher", "sys-info", "tantivy", + "tantivy-fst", "tantivy-jieba", "thrift", "typetag", @@ -4745,6 +4748,7 @@ dependencies = [ name = "databend-common-tracing" version = "0.1.0" dependencies = [ + "anyhow", "backtrace", "chrono", "color-backtrace", @@ -4761,6 +4765,7 @@ dependencies = [ "opentelemetry-otlp", "opentelemetry_sdk", "serde", + "serde_json", "strip-ansi-escapes", "tonic 0.11.0", ] @@ -5418,6 +5423,7 @@ dependencies = [ "databend-storages-common-table-meta", "fastrace 0.7.2", "jsonb", + "levenshtein_automata", "log", "match-template", "parquet", @@ -5426,6 +5432,7 @@ dependencies = [ "serde_json", "tantivy", "tantivy-common", + "tantivy-fst", "thiserror", "xorfilter-rs", ] @@ -10018,8 +10025,9 @@ dependencies = [ [[package]] name = "logforth" -version = "0.11.0" -source = "git+http://github.com/andylokandy/logforth?rev=0ca61ca#0ca61ca0fa3c87b5af5a08aa0354d96604e685c0" +version = "0.12.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "633080680671612565f637d1e33c5bcb7d58fb12c7d658baa166a03487265e80" dependencies = [ "anyhow", "colored", @@ -11386,8 +11394,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "stable_deref_trait", ] @@ -15099,8 +15106,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8d0582f186c0a6d55655d24543f15e43607299425c5ad8352c242b914b31856" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "aho-corasick", "arc-swap", @@ -15150,8 +15156,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "bitpacking 0.9.2", ] @@ -15159,8 +15164,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "downcast-rs", "fastdivide", @@ -15175,8 +15179,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "async-trait", "byteorder", @@ -15199,8 +15202,7 @@ dependencies = [ [[package]] name = "tantivy-jieba" version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2fe65c125f0d76d06f0f2ce9fbb9287b53f0dafb51a6270d984a840e2f16c1" +source = "git+https://github.com/b41sh/tantivy-jieba?rev=af84361#af843610bc3bea826329af07256598c413f0dd6a" dependencies = [ "jieba-rs", "lazy_static", @@ -15210,8 +15212,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "nom", ] @@ -15219,8 +15220,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -15231,8 +15231,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "murmurhash32", "rand_distr", @@ -15242,8 +15241,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 76b0ba372dbc..d6e92d5e51f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -310,7 +310,7 @@ serde_with = { version = "3.8.1" } serfig = "0.1.0" sled = { version = "0.34", default-features = false } stream-more = "0.1.3" -tantivy = "0.22.0" +tantivy = { git = "https://github.com/b41sh/tantivy", rev = "37aeac0" } thiserror = { version = "1" } tikv-jemalloc-ctl = { version = "0.5.0", features = ["use_std"] } tokio = { version = "1.35.0", features = ["full"] } diff --git a/scripts/ci/deploy/config/databend-query-node-otlp-logs.toml b/scripts/ci/deploy/config/databend-query-node-otlp-logs.toml index 3b9df9113073..ad9abf5cdd29 100644 --- a/scripts/ci/deploy/config/databend-query-node-otlp-logs.toml +++ b/scripts/ci/deploy/config/databend-query-node-otlp-logs.toml @@ -52,14 +52,14 @@ dir = "./.databend/logs_1" [log.query] on = true -otlp_endpoint = "http://127.0.0.1:4317/v1/logs" +otlp_endpoint = "http://127.0.0.1:4317" [log.query.otlp_labels] qkey1 = "qvalue1" qkey2 = "qvalue2" [log.profile] on = true -otlp_endpoint = "http://127.0.0.1:4318/v1/logs" +otlp_endpoint = "http://127.0.0.1:4318" otlp_protocol = "http" [log.profile.otlp_labels] pkey1 = "pvalue1" diff --git a/src/common/io/src/constants.rs b/src/common/io/src/constants.rs index a9c8fbf65de6..de647fb0c8ad 100644 --- a/src/common/io/src/constants.rs +++ b/src/common/io/src/constants.rs @@ -32,6 +32,8 @@ pub const DEFAULT_BLOCK_INDEX_BUFFER_SIZE: usize = 300 * 1024; pub const DEFAULT_BLOCK_MAX_ROWS: usize = 1000 * 1000; // The min number of a block by default. 
pub const DEFAULT_BLOCK_MIN_ROWS: usize = 800 * 1000; +/// The number of bytes read at the end of the file on first read +pub const DEFAULT_FOOTER_READ_SIZE: u64 = 64 * 1024; // The min values of table option data_retention_period_in_hours pub const DEFAULT_MIN_TABLE_LEVEL_DATA_RETENTION_PERIOD_IN_HOURS: u64 = 1; diff --git a/src/common/tracing/Cargo.toml b/src/common/tracing/Cargo.toml index bf3b1cd70ce8..58ebe2903dbf 100644 --- a/src/common/tracing/Cargo.toml +++ b/src/common/tracing/Cargo.toml @@ -11,6 +11,7 @@ doctest = false test = true [dependencies] +anyhow = { workspace = true } backtrace = { workspace = true } chrono = { workspace = true } color-backtrace = { version = "0.6" } @@ -22,7 +23,7 @@ fastrace-opentelemetry = { workspace = true } itertools = { workspace = true } libc = "0.2.153" log = { workspace = true } -logforth = { version = "0.11", git = "http://github.com/andylokandy/logforth", rev = "0ca61ca", features = [ +logforth = { version = "0.12", features = [ 'json', 'rolling_file', 'opentelemetry', @@ -32,6 +33,7 @@ opentelemetry = { workspace = true } opentelemetry-otlp = { workspace = true } opentelemetry_sdk = { workspace = true } serde = { workspace = true } +serde_json = { workspace = true } strip-ansi-escapes = "0.2" tonic = { workspace = true } diff --git a/src/common/tracing/src/init.rs b/src/common/tracing/src/init.rs index 6931df1d1ebf..93b5793dfdea 100644 --- a/src/common/tracing/src/init.rs +++ b/src/common/tracing/src/init.rs @@ -224,15 +224,19 @@ pub fn init_logging( let labels = labels .iter() .chain(&cfg.otlp.endpoint.labels) - .map(|(k, v)| (k.clone().into(), v.clone().into())) - .chain([("category".into(), "system".into())]); - let otel = logforth::append::OpentelemetryLog::new( + .map(|(k, v)| (Cow::from(k.clone()), Cow::from(v.clone()))) + .chain([(Cow::from("category"), Cow::from("system"))]); + let mut otel_builder = logforth::append::opentelemetry::OpentelemetryLogBuilder::new( log_name, - &cfg.otlp.endpoint.endpoint, - cfg.otlp.endpoint.protocol.into(), - labels, + format!("{}/v1/logs", &cfg.otlp.endpoint.endpoint), ) - .expect("initialize opentelemetry logger"); + .with_protocol(cfg.otlp.endpoint.protocol.into()); + for (k, v) in labels { + otel_builder = otel_builder.add_label(k, v); + } + let otel = otel_builder + .build() + .expect("initialize opentelemetry logger"); let dispatch = Dispatch::new() .filter(TargetFilter::level_for( "databend::log::query", @@ -290,7 +294,6 @@ pub fn init_logging( "databend::log::query", LevelFilter::Off, )) - .layout(get_layout(&cfg.file.format)) .append(query_log_file); logger = logger.dispatch(dispatch); } @@ -298,15 +301,19 @@ pub fn init_logging( let labels = labels .iter() .chain(&endpoint.labels) - .map(|(k, v)| (k.clone().into(), v.clone().into())) - .chain([("category".into(), "query".into())]); - let otel = logforth::append::OpentelemetryLog::new( + .map(|(k, v)| (Cow::from(k.clone()), Cow::from(v.clone()))) + .chain([(Cow::from("category"), Cow::from("query"))]); + let mut otel_builder = logforth::append::opentelemetry::OpentelemetryLogBuilder::new( log_name, - &endpoint.endpoint, - endpoint.protocol.into(), - labels, + format!("{}/v1/logs", &endpoint.endpoint), ) - .expect("initialize opentelemetry logger"); + .with_protocol(endpoint.protocol.into()); + for (k, v) in labels { + otel_builder = otel_builder.add_label(k, v); + } + let otel = otel_builder + .build() + .expect("initialize opentelemetry logger"); let dispatch = Dispatch::new() .filter(TargetFilter::level_for_not( "databend::log::query", @@ 
-329,7 +336,6 @@ pub fn init_logging( "databend::log::profile", LevelFilter::Off, )) - .layout(get_layout(&cfg.file.format)) .append(profile_log_file); logger = logger.dispatch(dispatch); } @@ -337,15 +343,19 @@ pub fn init_logging( let labels = labels .iter() .chain(&endpoint.labels) - .map(|(k, v)| (k.clone().into(), v.clone().into())) - .chain([("category".into(), "profile".into())]); - let otel = logforth::append::OpentelemetryLog::new( + .map(|(k, v)| (Cow::from(k.clone()), Cow::from(v.clone()))) + .chain([(Cow::from("category"), Cow::from("profile"))]); + let mut otel_builder = logforth::append::opentelemetry::OpentelemetryLogBuilder::new( log_name, - &endpoint.endpoint, - endpoint.protocol.into(), - labels, + format!("{}/v1/logs", &endpoint.endpoint), ) - .expect("initialize opentelemetry logger"); + .with_protocol(endpoint.protocol.into()); + for (k, v) in labels { + otel_builder = otel_builder.add_label(k, v); + } + let otel = otel_builder + .build() + .expect("initialize opentelemetry logger"); let dispatch = Dispatch::new() .filter(TargetFilter::level_for_not( "databend::log::profile", diff --git a/src/common/tracing/src/loggers.rs b/src/common/tracing/src/loggers.rs index 08ff7ed96aad..6285a438fb57 100644 --- a/src/common/tracing/src/loggers.rs +++ b/src/common/tracing/src/loggers.rs @@ -12,13 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Arguments; +use std::path::Path; + +use databend_common_base::runtime::ThreadTracker; +use log::Record; use logforth::append::rolling_file::NonBlockingBuilder; use logforth::append::rolling_file::RollingFileWriter; use logforth::append::rolling_file::Rotation; use logforth::append::RollingFile; -use logforth::layout::JsonLayout; -use logforth::layout::TextLayout; +use logforth::layout::collect_kvs; +use logforth::layout::CustomLayout; +use logforth::layout::KvDisplay; use logforth::Layout; +use serde_json::Map; /// Create a `BufWriter` for a rolling file logger. 
pub(crate) fn new_rolling_file_appender( @@ -41,8 +48,86 @@ pub(crate) fn new_rolling_file_appender( pub fn get_layout(format: &str) -> Layout { match format { - "text" => TextLayout::default().into(), - "json" => JsonLayout::default().into(), + "text" => text_layout(), + "json" => json_layout(), _ => unimplemented!("file logging format {format} is not supported"), } } + +fn text_layout() -> Layout { + CustomLayout::new( + |record: &Record, f: &dyn Fn(Arguments) -> anyhow::Result<()>| { + match ThreadTracker::query_id() { + None => { + f(format_args!( + "{} {:>5} {}: {}:{} {}{}", + chrono::Local::now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + record.level(), + record.module_path().unwrap_or(""), + Path::new(record.file().unwrap_or_default()) + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or_default(), + record.line().unwrap_or(0), + record.args(), + KvDisplay::new(record.key_values()), + ))?; + } + Some(query_id) => { + f(format_args!( + "{} {} {:>5} {}: {}:{} {}{}", + query_id, + chrono::Local::now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + record.level(), + record.module_path().unwrap_or(""), + Path::new(record.file().unwrap_or_default()) + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or_default(), + record.line().unwrap_or(0), + record.args(), + KvDisplay::new(record.key_values()), + ))?; + } + } + + Ok(()) + }, + ) + .into() +} + +fn json_layout() -> Layout { + CustomLayout::new( + |record: &Record, f: &dyn Fn(Arguments) -> anyhow::Result<()>| { + let mut fields = Map::new(); + fields.insert("message".to_string(), format!("{}", record.args()).into()); + for (k, v) in collect_kvs(record.key_values()) { + fields.insert(k, v.into()); + } + + match ThreadTracker::query_id() { + None => { + f(format_args!( + r#"{{"timestamp":"{}","level":"{}","fields":{}}}"#, + chrono::Local::now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + record.level(), + serde_json::to_string(&fields).unwrap_or_default(), + ))?; + } + Some(query_id) => { + f(format_args!( + r#"{{"timestamp":"{}","level":"{}","query_id":"{}","fields":{}}}"#, + chrono::Local::now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + record.level(), + query_id, + serde_json::to_string(&fields).unwrap_or_default(), + ))?; + } + } + + Ok(()) + }, + ) + .into() +} diff --git a/src/meta/api/src/name_id_value_api.rs b/src/meta/api/src/name_id_value_api.rs index 8b705272f0f6..6309fe40330f 100644 --- a/src/meta/api/src/name_id_value_api.rs +++ b/src/meta/api/src/name_id_value_api.rs @@ -449,6 +449,7 @@ mod tests { updated_on: Default::default(), comment: "".to_string(), drop_on: None, + gc_in_progress: false, }; let v = db_meta(1).to_pb()?.encode_to_vec(); diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 0cb6ce446bd8..9017b709b3a7 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -24,6 +24,7 @@ use std::time::Duration; use chrono::DateTime; use chrono::Utc; use databend_common_base::base::uuid::Uuid; +use databend_common_base::display::display_slice::DisplaySliceExt; use databend_common_meta_app::app_error::AppError; use databend_common_meta_app::app_error::CommitTableMetaError; use databend_common_meta_app::app_error::CreateAsDropTableWithoutDropTime; @@ -157,7 +158,6 @@ use databend_common_meta_app::schema::UpdateMultiTableMetaResult; use databend_common_meta_app::schema::UpdateStreamMetaReq; use databend_common_meta_app::schema::UpdateTableMetaReply; use 
databend_common_meta_app::schema::UpdateVirtualColumnReq; -use databend_common_meta_app::schema::UpsertTableCopiedFileReq; use databend_common_meta_app::schema::UpsertTableOptionReply; use databend_common_meta_app::schema::UpsertTableOptionReq; use databend_common_meta_app::schema::VirtualColumnIdent; @@ -185,9 +185,11 @@ use databend_common_meta_types::TxnOp; use databend_common_meta_types::TxnRequest; use databend_common_proto_conv::FromToProto; use fastrace::func_name; +use futures::StreamExt; use futures::TryStreamExt; use log::debug; use log::error; +use log::info; use log::warn; use ConditionResult::Eq; @@ -219,6 +221,7 @@ use crate::util::deserialize_struct_get_response; use crate::util::get_table_by_id_or_err; use crate::util::list_tables_from_unshare_db; use crate::util::mget_pb_values; +use crate::util::txn_delete_exact; use crate::util::txn_op_put_pb; use crate::util::unknown_database_error; use crate::SchemaApi; @@ -1219,7 +1222,7 @@ impl + ?Sized> SchemaApi for KV { &req.name_ident.tenant, *id.data, *seq_db_id.data, - false, + true, false, &mut txn, ) @@ -2390,7 +2393,7 @@ impl + ?Sized> SchemaApi for KV { } = req; let mut tbl_seqs = HashMap::new(); - let mut txn_req = TxnRequest::default(); + let mut txn = TxnRequest::default(); let mut mismatched_tbs = vec![]; let tid_vec = update_table_metas .iter() @@ -2436,13 +2439,10 @@ impl + ?Sized> SchemaApi for KV { new_table_meta.shared_by = table_meta.shared_by.clone(); tbl_seqs.insert(req.0.table_id, *tb_meta_seq); - txn_req - .condition - .push(txn_cond_seq(&tbid, Eq, *tb_meta_seq)); - txn_req - .if_then + txn.condition.push(txn_cond_seq(&tbid, Eq, *tb_meta_seq)); + txn.if_then .push(txn_op_put(&tbid, serialize_struct(&new_table_meta)?)); - txn_req.else_then.push(TxnOp { + txn.else_then.push(TxnOp { request: Some(Request::Get(TxnGetRequest { key: tbid.to_string_key(), })), @@ -2450,16 +2450,35 @@ impl + ?Sized> SchemaApi for KV { new_table_meta_map.insert(req.0.table_id, new_table_meta); } - for (tbid, req) in copied_files { - let tbid = TableId { table_id: tbid }; - let (conditions, match_operations) = build_upsert_table_copied_file_info_conditions( - &tbid, - &req, - tbl_seqs[&tbid.table_id], - req.fail_if_duplicated, - )?; - txn_req.condition.extend(conditions); - txn_req.if_then.extend(match_operations) + + // `remove_table_copied_files` and `upsert_table_copied_file_info` + // all modify `TableCopiedFileInfo`, + // so there used to has `TableCopiedFileLockKey` in these two functions + // to protect TableCopiedFileInfo modification. + // In issue: https://github.com/datafuselabs/databend/issues/8897, + // there is chance that if copy files concurrently, `upsert_table_copied_file_info` + // may return `TxnRetryMaxTimes`. + // So now, in case that `TableCopiedFileInfo` has expire time, remove `TableCopiedFileLockKey` + // in each function. In this case there is chance that some `TableCopiedFileInfo` may not be + // removed in `remove_table_copied_files`, but these data can be purged in case of expire time. + + for (table_id, req) in copied_files { + let tbid = TableId { table_id }; + + let table_meta_seq = tbl_seqs[&tbid.table_id]; + txn.condition.push(txn_cond_eq_seq(&tbid, table_meta_seq)); + + for (file_name, file_info) in req.file_info { + let key = TableCopiedFileNameIdent { + table_id: tbid.table_id, + file: file_name, + }; + + if req.insert_if_not_exists { + txn.condition.push(txn_cond_eq_seq(&key, 0)); + } + txn.if_then.push(txn_op_put_pb(&key, &file_info, req.ttl)?) 
+ } } let sid_vec = update_stream_metas @@ -2500,20 +2519,17 @@ impl + ?Sized> SchemaApi for KV { new_stream_meta.options = req.options.clone(); new_stream_meta.updated_on = Utc::now(); - txn_req - .condition + txn.condition .push(txn_cond_seq(&stream_id, Eq, stream_meta_seq)); - txn_req - .if_then + txn.if_then .push(txn_op_put(&stream_id, serialize_struct(&new_stream_meta)?)); } for deduplicated_label in deduplicated_labels { - txn_req - .if_then + txn.if_then .push(build_upsert_table_deduplicated_label(deduplicated_label)); } - let (succ, responses) = send_txn(self, txn_req).await?; + let (succ, responses) = send_txn(self, txn).await?; if succ { return Ok(Ok(UpdateTableMetaReply {})); } @@ -3357,9 +3373,9 @@ async fn construct_drop_table_txn_operations( return if if_exists { Ok((0, 0)) } else { - return Err(KVAppError::AppError(AppError::UnknownTable( + Err(KVAppError::AppError(AppError::UnknownTable( UnknownTable::new(tbname, "drop_table_by_id"), - ))); + ))) }; } @@ -3374,9 +3390,13 @@ async fn construct_drop_table_txn_operations( let mut tb_meta = tb_meta.unwrap(); // drop a table with drop_on time if tb_meta.drop_on.is_some() { - return Err(KVAppError::AppError(AppError::DropTableWithDropTime( - DropTableWithDropTime::new(&dbid_tbname.table_name), - ))); + return if if_exists { + Ok((0, 0)) + } else { + Err(KVAppError::AppError(AppError::DropTableWithDropTime( + DropTableWithDropTime::new(&dbid_tbname.table_name), + ))) + }; } tb_meta.drop_on = Some(Utc::now()); @@ -3521,47 +3541,69 @@ async fn drop_database_meta( Ok(*seq_db_id.data) } -/// remove copied files for a table. +/// Remove copied files for a dropped table. /// -/// Returns number of files that are going to be removed. -async fn remove_table_copied_files( +/// Dropped table can not be accessed by any query, +/// so it is safe to remove all the copied files in multiple sub transactions. +async fn remove_copied_files_for_dropped_table( kv_api: &(impl kvapi::KVApi + ?Sized), - table_id: u64, - txn: &mut TxnRequest, -) -> Result { - let mut n = 0; - let chunk_size = DEFAULT_MGET_SIZE; - - // `list_keys` list all the `TableCopiedFileNameIdent` of the table. - // But if a upsert_table_copied_file_info run concurrently, there is chance that - // `list_keys` may lack of some new inserted TableCopiedFileNameIdent. - // But since TableCopiedFileNameIdent has expire time, they can be purged by expire time. - let copied_files = list_table_copied_files(kv_api, table_id).await?; - - for chunk in copied_files.chunks(chunk_size) { - // Load the `seq` of every copied file - let seqs = { - let str_keys: Vec<_> = chunk.iter().map(|f| f.to_string_key()).collect(); + table_id: &TableId, +) -> Result<(), MetaError> { + let batch_size = 1024; + + // Loop until: + // - all cleaned + // - or table is removed from meta-service + // - or is no longer in `droppped` state. + for i in 0..usize::MAX { + let mut txn = TxnRequest::default(); + + let seq_meta = kv_api.get_pb(table_id).await?; + let Some(seq_table_meta) = seq_meta else { + return Ok(()); + }; - let seq_infos: Vec<(u64, Option)> = - mget_pb_values(kv_api, &str_keys).await?; + // TODO: enable this check. Currently when gc db, the table may not be dropped. + // if seq_table_meta.data.drop_on.is_none() { + // return Ok(()); + // } - seq_infos.into_iter().map(|(seq, _)| seq) + // Make sure the table meta is not changed, such as being un-dropped. 
+ txn.condition + .push(txn_cond_eq_seq(table_id, seq_table_meta.seq)); + + let copied_file_ident = TableCopiedFileNameIdent { + table_id: table_id.table_id, + file: "dummy".to_string(), }; - for (copied_seq, copied_ident) in seqs.zip(chunk) { - if copied_seq == 0 { - continue; - } + let dir_name = DirName::new(copied_file_ident); - txn.condition - .push(txn_cond_seq(copied_ident, Eq, copied_seq)); + let key_stream = kv_api.list_pb_keys(&dir_name).await?; + let copied_files = key_stream.take(batch_size).try_collect::>().await?; + + if copied_files.is_empty() { + return Ok(()); + } + + for copied_ident in copied_files.iter() { + // It is a dropped table, thus there is no data will be written to the table. + // Therefore, we only need to assert the table_meta seq, and there is no need to assert + // seq of each copied file. txn.if_then.push(txn_op_del(copied_ident)); - n += 1; } - } - Ok(n) + info!( + "remove_copied_files_for_dropped_table {}: {}-th batch remove: {} items: {}", + table_id, + i, + copied_files.len(), + copied_files.display() + ); + + send_txn(kv_api, txn).await?; + } + unreachable!() } /// List the copied file identities belonging to a table. @@ -3685,46 +3727,6 @@ fn table_has_to_not_exist( } } -fn build_upsert_table_copied_file_info_conditions( - table_id: &TableId, - req: &UpsertTableCopiedFileReq, - tb_meta_seq: u64, - fail_if_duplicated: bool, -) -> Result<(Vec, Vec), KVAppError> { - let mut condition = vec![txn_cond_seq(table_id, Eq, tb_meta_seq)]; - let mut if_then = vec![]; - - // `remove_table_copied_files` and `upsert_table_copied_file_info` - // all modify `TableCopiedFileInfo`, - // so there used to has `TableCopiedFileLockKey` in these two functions - // to protect TableCopiedFileInfo modification. - // In issue: https://github.com/datafuselabs/databend/issues/8897, - // there is chance that if copy files concurrently, `upsert_table_copied_file_info` - // may return `TxnRetryMaxTimes`. - // So now, in case that `TableCopiedFileInfo` has expire time, remove `TableCopiedFileLockKey` - // in each function. In this case there is chance that some `TableCopiedFileInfo` may not be - // removed in `remove_table_copied_files`, but these data can be purged in case of expire time. 
- - let file_name_infos = req.file_info.clone().into_iter(); - - for (file_name, file_info) in file_name_infos { - let key = TableCopiedFileNameIdent { - table_id: table_id.table_id, - file: file_name.to_owned(), - }; - if fail_if_duplicated { - // "fail_if_duplicated" mode, assumes files are absent - condition.push(txn_cond_seq(&key, Eq, 0)); - } - if_then.push(TxnOp::put_with_ttl( - key.to_string_key(), - serialize_struct(&file_info)?, - req.ttl, - )) - } - Ok((condition, if_then)) -} - fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { TxnOp::put_with_ttl( deduplicated_label, @@ -3994,8 +3996,11 @@ async fn gc_dropped_db_by_id( }; for tb_id in tb_id_list.id_list { - gc_dropped_table_data(kv_api, tb_id, &mut txn).await?; - gc_dropped_table_index(kv_api, tenant, tb_id, &mut txn).await?; + let table_id_ident = TableId { table_id: tb_id }; + remove_copied_files_for_dropped_table(kv_api, &table_id_ident).await?; + remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; + remove_index_for_dropped_table(kv_api, tenant, &table_id_ident, &mut txn) + .await?; } let id_key = iter.next().unwrap(); @@ -4040,100 +4045,103 @@ async fn gc_dropped_table_by_id( table_name: String, ) -> Result<(), KVAppError> { // first get TableIdList - let dbid_tbname_idlist = TableIdHistoryIdent { + let table_id_history_ident = TableIdHistoryIdent { database_id: db_id, table_name, }; - let (tb_id_list_seq, tb_id_list_opt): (_, Option) = - get_pb_value(kv_api, &dbid_tbname_idlist).await?; - let mut tb_id_list = match tb_id_list_opt { - Some(list) => list, - None => return Ok(()), + + let seq_id_list = kv_api.get_pb(&table_id_history_ident).await?; + + let Some(seq_id_list) = seq_id_list else { + return Ok(()); }; - for (i, tb_id) in tb_id_list.id_list.iter().enumerate() { - if *tb_id != table_id { - continue; - } + let seq = seq_id_list.seq; + let mut tb_id_list = seq_id_list.data; - tb_id_list.id_list.remove(i); + // remove table_id from tb_id_list: + { + let index = tb_id_list.id_list.iter().position(|&x| x == table_id); + let Some(index) = index else { + return Ok(()); + }; - let mut txn = TxnRequest::default(); + tb_id_list.id_list.remove(index); + } - // construct the txn request - txn.condition.push( - // condition: table id list not changed - txn_cond_seq(&dbid_tbname_idlist, Eq, tb_id_list_seq), - ); + let mut txn = TxnRequest::default(); - if tb_id_list.id_list.is_empty() { - txn.if_then.push(txn_op_del(&dbid_tbname_idlist)); - } else { - // save new table id list - txn.if_then.push(txn_op_put( - &dbid_tbname_idlist, - serialize_struct(&tb_id_list)?, - )); - } - gc_dropped_table_data(kv_api, table_id, &mut txn).await?; - gc_dropped_table_index(kv_api, tenant, table_id, &mut txn).await?; + // construct the txn request + txn.condition.push( + // condition: table id list not changed + txn_cond_eq_seq(&table_id_history_ident, seq), + ); - let _resp = kv_api.transaction(txn).await?; - break; + if tb_id_list.id_list.is_empty() { + txn.if_then.push(txn_op_del(&table_id_history_ident)); + } else { + // save new table id list + txn.if_then + .push(txn_op_put_pb(&table_id_history_ident, &tb_id_list, None)?); } + let table_id_ident = TableId { table_id }; + remove_copied_files_for_dropped_table(kv_api, &table_id_ident).await?; + remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; + remove_index_for_dropped_table(kv_api, tenant, &table_id_ident, &mut txn).await?; + + let _resp = kv_api.transaction(txn).await?; + Ok(()) } -async fn gc_dropped_table_data( 
+async fn remove_data_for_dropped_table( kv_api: &(impl kvapi::KVApi + ?Sized), - table_id: u64, + table_id: &TableId, txn: &mut TxnRequest, ) -> Result<(), KVAppError> { - let tbid = TableId { table_id }; - let id_to_name = TableIdToName { table_id }; + let seq_meta = kv_api.get_pb(table_id).await?; - // Get meta data - let (tb_meta_seq, tb_meta): (_, Option) = get_pb_value(kv_api, &tbid).await?; - - if tb_meta_seq == 0 || tb_meta.is_none() { + let Some(seq_meta) = seq_meta else { error!( "gc_dropped_table_by_id cannot find {:?} table_meta", table_id ); return Ok(()); - } + }; + + // TODO: enable this check. Currently when gc db, the table may not be dropped. + // if seq_meta.data.drop_on.is_none() { + // warn!("gc_dropped_table_by_id {:?} is not dropped", table_id); + // return Ok(()); + // } + + txn_delete_exact(txn, table_id, seq_meta.seq); // Get id -> name mapping - let (name_seq, _name): (_, Option) = get_pb_value(kv_api, &id_to_name).await?; + let id_to_name = TableIdToName { + table_id: table_id.table_id, + }; + let seq_name = kv_api.get_pb(&id_to_name).await?; - // table id not changed - txn.condition.push(txn_cond_seq(&tbid, Eq, tb_meta_seq)); // consider only when TableIdToName exist - if name_seq != 0 { - // table id to name not changed - txn.condition.push(txn_cond_seq(&id_to_name, Eq, name_seq)); - // remove table id to name - txn.if_then.push(txn_op_del(&id_to_name)); + if let Some(seq_name) = seq_name { + txn_delete_exact(txn, &id_to_name, seq_name.seq); } - // remove table meta - txn.if_then.push(txn_op_del(&tbid)); - - remove_table_copied_files(kv_api, table_id, txn).await?; Ok(()) } -async fn gc_dropped_table_index( +async fn remove_index_for_dropped_table( kv_api: &(impl kvapi::KVApi + ?Sized), tenant: &Tenant, - table_id: u64, + table_id: &TableId, txn: &mut TxnRequest, ) -> Result<(), KVAppError> { let name_id_metas = kv_api .list_indexes(ListIndexesReq { tenant: tenant.clone(), - table_id: Some(table_id), + table_id: Some(table_id.table_id), }) .await?; diff --git a/src/meta/api/src/schema_api_test_suite.rs b/src/meta/api/src/schema_api_test_suite.rs index 917b99ee945a..ac7fb4e794a3 100644 --- a/src/meta/api/src/schema_api_test_suite.rs +++ b/src/meta/api/src/schema_api_test_suite.rs @@ -80,7 +80,6 @@ use databend_common_meta_app::schema::ExtendLockRevReq; use databend_common_meta_app::schema::GcDroppedTableReq; use databend_common_meta_app::schema::GetDatabaseReq; use databend_common_meta_app::schema::GetSequenceNextValueReq; -use databend_common_meta_app::schema::GetSequenceReq; use databend_common_meta_app::schema::GetTableCopiedFileReq; use databend_common_meta_app::schema::GetTableReq; use databend_common_meta_app::schema::IcebergCatalogOption; @@ -2599,7 +2598,7 @@ impl SchemaApiTestSuite { let upsert_source_table = UpsertTableCopiedFileReq { file_info, ttl: None, - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { @@ -2649,7 +2648,7 @@ impl SchemaApiTestSuite { let upsert_source_table = UpsertTableCopiedFileReq { file_info, ttl: None, - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { table_id, @@ -2698,7 +2697,7 @@ impl SchemaApiTestSuite { let upsert_source_table = UpsertTableCopiedFileReq { file_info, ttl: None, - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { table_id, @@ -3558,7 +3557,7 @@ impl SchemaApiTestSuite { let copied_file_req = UpsertTableCopiedFileReq { file_info: file_info.clone(), ttl: 
Some(std::time::Duration::from_secs(86400)), - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { @@ -3722,7 +3721,7 @@ impl SchemaApiTestSuite { let copied_file_req = UpsertTableCopiedFileReq { file_info: file_info.clone(), ttl: Some(std::time::Duration::from_secs(86400)), - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { @@ -5598,12 +5597,11 @@ impl SchemaApiTestSuite { info!("--- get sequence"); { - let req = GetSequenceReq { - ident: SequenceIdent::new(&tenant, sequence_name), - }; - let resp = mt.get_sequence(req).await?; - assert_eq!(resp.meta.comment, Some("seq".to_string())); - assert_eq!(resp.meta.current, 1); + let req = SequenceIdent::new(&tenant, sequence_name); + let resp = mt.get_sequence(&req).await?; + let resp = resp.unwrap().data; + assert_eq!(resp.comment, Some("seq".to_string())); + assert_eq!(resp.current, 1); } info!("--- get sequence nextval"); @@ -5619,13 +5617,11 @@ impl SchemaApiTestSuite { info!("--- get sequence after nextval"); { - let req = GetSequenceReq { - ident: SequenceIdent::new(&tenant, sequence_name), - }; - - let resp = mt.get_sequence(req).await?; - assert_eq!(resp.meta.comment, Some("seq".to_string())); - assert_eq!(resp.meta.current, 11); + let req = SequenceIdent::new(&tenant, sequence_name); + let resp = mt.get_sequence(&req).await?; + let resp = resp.unwrap().data; + assert_eq!(resp.comment, Some("seq".to_string())); + assert_eq!(resp.current, 11); } info!("--- replace sequence"); @@ -5639,13 +5635,12 @@ impl SchemaApiTestSuite { let _resp = mt.create_sequence(req).await?; - let req = GetSequenceReq { - ident: SequenceIdent::new(&tenant, sequence_name), - }; + let req = SequenceIdent::new(&tenant, sequence_name); - let resp = mt.get_sequence(req).await?; - assert_eq!(resp.meta.comment, Some("seq1".to_string())); - assert_eq!(resp.meta.current, 1); + let resp = mt.get_sequence(&req).await?; + let resp = resp.unwrap().data; + assert_eq!(resp.comment, Some("seq1".to_string())); + assert_eq!(resp.current, 1); } { @@ -5656,12 +5651,9 @@ impl SchemaApiTestSuite { let _resp = mt.drop_sequence(req).await?; - let req = GetSequenceReq { - ident: SequenceIdent::new(&tenant, sequence_name), - }; - - let resp = mt.get_sequence(req).await; - assert!(resp.is_err()); + let req = SequenceIdent::new(&tenant, sequence_name); + let resp = mt.get_sequence(&req).await?; + assert!(resp.is_none()); } Ok(()) @@ -5734,7 +5726,7 @@ impl SchemaApiTestSuite { let copied_file_req = UpsertTableCopiedFileReq { file_info: file_info.clone(), ttl: Some(std::time::Duration::from_secs(86400)), - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { @@ -5784,7 +5776,7 @@ impl SchemaApiTestSuite { file_info: file_info.clone(), // Make it expire at once. 
ttl: Some(std::time::Duration::from_secs(0)), - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { @@ -7213,7 +7205,7 @@ impl SchemaApiTestSuite { let copied_file_req = UpsertTableCopiedFileReq { file_info: file_info.clone(), ttl: Some(std::time::Duration::from_secs(86400)), - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { @@ -7271,7 +7263,7 @@ impl SchemaApiTestSuite { let copied_file_req = UpsertTableCopiedFileReq { file_info: file_info.clone(), ttl: Some(std::time::Duration::from_secs(86400)), - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { @@ -7326,7 +7318,7 @@ impl SchemaApiTestSuite { let copied_file_req = UpsertTableCopiedFileReq { file_info: file_info.clone(), ttl: Some(std::time::Duration::from_secs(86400)), - fail_if_duplicated: false, + insert_if_not_exists: false, }; let req = UpdateTableMetaReq { @@ -7687,7 +7679,7 @@ where MT: SchemaApi + kvapi::AsKVApi let copied_file_req = UpsertTableCopiedFileReq { file_info: file_infos.clone(), ttl: Some(std::time::Duration::from_secs(86400)), - fail_if_duplicated: true, + insert_if_not_exists: true, }; let req = UpdateTableMetaReq { diff --git a/src/meta/api/src/sequence_api.rs b/src/meta/api/src/sequence_api.rs index 609a2bf4a4e7..9c48617d062c 100644 --- a/src/meta/api/src/sequence_api.rs +++ b/src/meta/api/src/sequence_api.rs @@ -18,8 +18,10 @@ use databend_common_meta_app::schema::DropSequenceReply; use databend_common_meta_app::schema::DropSequenceReq; use databend_common_meta_app::schema::GetSequenceNextValueReply; use databend_common_meta_app::schema::GetSequenceNextValueReq; -use databend_common_meta_app::schema::GetSequenceReply; -use databend_common_meta_app::schema::GetSequenceReq; +use databend_common_meta_app::schema::SequenceIdent; +use databend_common_meta_app::schema::SequenceMeta; +use databend_common_meta_types::MetaError; +use databend_common_meta_types::SeqV; use crate::kv_app_error::KVAppError; @@ -30,7 +32,10 @@ pub trait SequenceApi: Send + Sync { req: CreateSequenceReq, ) -> Result; - async fn get_sequence(&self, req: GetSequenceReq) -> Result; + async fn get_sequence( + &self, + req: &SequenceIdent, + ) -> Result>, MetaError>; async fn get_sequence_next_value( &self, diff --git a/src/meta/api/src/sequence_api_impl.rs b/src/meta/api/src/sequence_api_impl.rs index d59f26e5e83b..6183d0cfd5e8 100644 --- a/src/meta/api/src/sequence_api_impl.rs +++ b/src/meta/api/src/sequence_api_impl.rs @@ -12,15 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::fmt::Display; - use chrono::Utc; use databend_common_meta_app::app_error::AppError; -use databend_common_meta_app::app_error::CreateSequenceError; use databend_common_meta_app::app_error::OutofSequenceRange; -use databend_common_meta_app::app_error::SequenceAlreadyExists; use databend_common_meta_app::app_error::SequenceError; -use databend_common_meta_app::app_error::UnknownSequence; use databend_common_meta_app::app_error::WrongSequenceCount; use databend_common_meta_app::schema::CreateOption; use databend_common_meta_app::schema::CreateSequenceReply; @@ -29,28 +24,24 @@ use databend_common_meta_app::schema::DropSequenceReply; use databend_common_meta_app::schema::DropSequenceReq; use databend_common_meta_app::schema::GetSequenceNextValueReply; use databend_common_meta_app::schema::GetSequenceNextValueReq; -use databend_common_meta_app::schema::GetSequenceReply; -use databend_common_meta_app::schema::GetSequenceReq; use databend_common_meta_app::schema::SequenceIdent; use databend_common_meta_app::schema::SequenceMeta; use databend_common_meta_kvapi::kvapi; -use databend_common_meta_types::ConditionResult::Eq; use databend_common_meta_types::MatchSeq; use databend_common_meta_types::MetaError; +use databend_common_meta_types::SeqV; use databend_common_meta_types::TxnRequest; use fastrace::func_name; use log::debug; use crate::databend_common_meta_types::With; -use crate::get_pb_value; use crate::kv_app_error::KVAppError; use crate::kv_pb_api::KVPbApi; use crate::kv_pb_api::UpsertPB; use crate::send_txn; -use crate::serialize_struct; use crate::txn_backoff::txn_backoff; -use crate::txn_cond_seq; -use crate::txn_op_put; +use crate::txn_cond_eq_seq; +use crate::util::txn_op_put_pb; use crate::SequenceApi; #[async_trait::async_trait] @@ -62,7 +53,6 @@ impl + ?Sized> SequenceApi for KV { ) -> Result { debug!(req :? =(&req); "SchemaApi: {}", func_name!()); - let sequence_name = req.ident.name(); let meta: SequenceMeta = req.clone().into(); let seq = MatchSeq::from(req.create_option); @@ -81,19 +71,11 @@ impl + ?Sized> SequenceApi for KV { if !reply.is_changed() { match req.create_option { CreateOption::Create => Err(KVAppError::AppError(AppError::SequenceError( - SequenceError::SequenceAlreadyExists(SequenceAlreadyExists::new( - sequence_name, - format!("create sequence: {:?}", sequence_name), - )), + SequenceError::SequenceAlreadyExists(req.ident.exist_error(func_name!())), ))), CreateOption::CreateIfNotExists => Ok(CreateSequenceReply {}), CreateOption::CreateOrReplace => { - Err(KVAppError::AppError(AppError::SequenceError( - SequenceError::CreateSequenceError(CreateSequenceError::new( - sequence_name, - format!("replace sequence: {:?} fail", sequence_name), - )), - ))) + unreachable!("CreateOrReplace should always success") } } } else { @@ -101,19 +83,13 @@ impl + ?Sized> SequenceApi for KV { } } - async fn get_sequence(&self, req: GetSequenceReq) -> Result { - debug!(req :? =(&req); "SchemaApi: {}", func_name!()); - let sequence_name = req.ident.name(); - let (_sequence_seq, sequence_meta) = get_sequence_or_err( - self, - &req.ident, - format!("get_sequence_next_values: {:?}", sequence_name), - ) - .await?; - - Ok(GetSequenceReply { - meta: sequence_meta, - }) + async fn get_sequence( + &self, + name_ident: &SequenceIdent, + ) -> Result>, MetaError> { + debug!(req :? 
=name_ident; "SchemaApi: {}", func_name!()); + let seq_meta = self.get_pb(name_ident).await?; + Ok(seq_meta) } async fn get_sequence_next_value( @@ -133,12 +109,15 @@ impl + ?Sized> SequenceApi for KV { let mut trials = txn_backoff(None, func_name!()); loop { trials.next().unwrap()?.await; - let (sequence_seq, mut sequence_meta) = get_sequence_or_err( - self, - &ident, - format!("get_sequence_next_values: {:?}", sequence_name), - ) - .await?; + let seq_meta = self.get_pb(&ident).await?; + let Some(seq_meta) = seq_meta else { + return Err(AppError::SequenceError(SequenceError::UnknownSequence( + ident.unknown_error(func_name!()), + )) + .into()); + }; + let sequence_seq = seq_meta.seq; + let mut sequence_meta = seq_meta.data; let start = sequence_meta.current; let count = req.count; @@ -158,9 +137,9 @@ impl + ?Sized> SequenceApi for KV { sequence_meta.current += count; sequence_meta.update_on = Utc::now(); - let condition = vec![txn_cond_seq(&ident, Eq, sequence_seq)]; + let condition = vec![txn_cond_eq_seq(&ident, sequence_seq)]; let if_then = vec![ - txn_op_put(&ident, serialize_struct(&sequence_meta)?), // name -> meta + txn_op_put_pb(&ident, &sequence_meta, None)?, // name -> meta ]; let txn_req = TxnRequest { @@ -206,25 +185,3 @@ impl + ?Sized> SequenceApi for KV { Ok(DropSequenceReply { prev }) } } - -/// Returns (seq, sequence_meta) -async fn get_sequence_or_err( - kv_api: &(impl kvapi::KVApi + ?Sized), - key: &SequenceIdent, - msg: impl Display, -) -> Result<(u64, SequenceMeta), KVAppError> { - let (sequence_seq, sequence_meta) = get_pb_value(kv_api, key).await?; - - if sequence_seq == 0 { - debug!(seq = sequence_seq, SequenceIdent :?= (key); "sequence does not exist"); - - Err(KVAppError::AppError(AppError::SequenceError( - SequenceError::UnknownSequence(UnknownSequence::new( - key.name(), - format!("{}: {:?}", msg, key.name()), - )), - ))) - } else { - Ok((sequence_seq, sequence_meta.unwrap())) - } -} diff --git a/src/meta/app/Cargo.toml b/src/meta/app/Cargo.toml index c23211545cc4..d88de76ebe0b 100644 --- a/src/meta/app/Cargo.toml +++ b/src/meta/app/Cargo.toml @@ -19,6 +19,7 @@ chrono = { workspace = true } chrono-tz = { workspace = true } cron = "0.12.0" databend-common-ast = { workspace = true } +databend-common-base = { workspace = true } databend-common-exception = { workspace = true } databend-common-expression = { workspace = true } databend-common-io = { workspace = true } diff --git a/src/meta/app/src/app_error.rs b/src/meta/app/src/app_error.rs index b79904963f59..c40629c5124b 100644 --- a/src/meta/app/src/app_error.rs +++ b/src/meta/app/src/app_error.rs @@ -26,6 +26,7 @@ use crate::schema::dictionary_name_ident; use crate::schema::index_name_ident; use crate::schema::virtual_column_ident; use crate::schema::DictionaryIdentity; +use crate::schema::SequenceRsc; use crate::tenant_key::errors::ExistError; use crate::tenant_key::errors::UnknownError; use crate::tenant_key::ident::TIdent; @@ -866,54 +867,6 @@ impl IndexColumnIdNotFound { } } -#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] -#[error("CreateSequenceError: `{name}` while `{context}`")] -pub struct CreateSequenceError { - name: String, - context: String, -} - -impl CreateSequenceError { - pub fn new(name: impl ToString, context: impl ToString) -> Self { - Self { - name: name.to_string(), - context: context.to_string(), - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] -#[error("SequenceAlreadyExists: `{name}` while `{context}`")] -pub struct SequenceAlreadyExists { - name: String, - 
context: String, -} - -impl SequenceAlreadyExists { - pub fn new(name: impl ToString, context: impl ToString) -> Self { - Self { - name: name.to_string(), - context: context.to_string(), - } - } -} - -#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)] -#[error("UnknownSequence: `{name}` while `{context}`")] -pub struct UnknownSequence { - name: String, - context: String, -} - -impl UnknownSequence { - pub fn new(name: impl ToString, context: impl ToString) -> Self { - Self { - name: name.to_string(), - context: context.to_string(), - } - } -} - #[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)] #[error("OutofSequenceRange: `{name}` while `{context}`")] pub struct OutofSequenceRange { @@ -1175,13 +1128,10 @@ impl AppError { #[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)] pub enum SequenceError { #[error(transparent)] - CreateSequenceError(#[from] CreateSequenceError), - - #[error(transparent)] - SequenceAlreadyExists(#[from] SequenceAlreadyExists), + SequenceAlreadyExists(#[from] ExistError), #[error(transparent)] - UnknownSequence(#[from] UnknownSequence), + UnknownSequence(#[from] UnknownError), #[error(transparent)] OutofSequenceRange(#[from] OutofSequenceRange), @@ -1500,24 +1450,6 @@ impl AppErrorMessage for UnmatchMaskPolicyReturnType { } } -impl AppErrorMessage for CreateSequenceError { - fn message(&self) -> String { - format!("Create Sequence {} Error", self.name) - } -} - -impl AppErrorMessage for SequenceAlreadyExists { - fn message(&self) -> String { - format!("Sequence '{}' already exists", self.name) - } -} - -impl AppErrorMessage for UnknownSequence { - fn message(&self) -> String { - format!("Sequence '{}' does not exists", self.name) - } -} - impl AppErrorMessage for OutofSequenceRange { fn message(&self) -> String { format!("Sequence '{}' out of range", self.name) @@ -1533,9 +1465,6 @@ impl AppErrorMessage for WrongSequenceCount { impl AppErrorMessage for SequenceError { fn message(&self) -> String { match self { - SequenceError::CreateSequenceError(e) => { - format!("CreateSequenceError: '{}'", e.message()) - } SequenceError::SequenceAlreadyExists(e) => { format!("SequenceAlreadyExists: '{}'", e.message()) } @@ -1675,7 +1604,6 @@ impl From for ErrorCode { impl From for ErrorCode { fn from(app_err: SequenceError) -> Self { match app_err { - SequenceError::CreateSequenceError(err) => ErrorCode::SequenceError(err.message()), SequenceError::SequenceAlreadyExists(err) => ErrorCode::SequenceError(err.message()), SequenceError::UnknownSequence(err) => ErrorCode::SequenceError(err.message()), SequenceError::OutofSequenceRange(err) => ErrorCode::SequenceError(err.message()), diff --git a/src/meta/app/src/principal/procedure.rs b/src/meta/app/src/principal/procedure.rs index 003d3a01dcad..e6d693edb017 100644 --- a/src/meta/app/src/principal/procedure.rs +++ b/src/meta/app/src/principal/procedure.rs @@ -19,6 +19,7 @@ use std::ops::Deref; use chrono::DateTime; use chrono::Utc; +use databend_common_base::display::display_slice::DisplaySliceExt; use databend_common_expression::types::DataType; use crate::principal::procedure_id_ident::ProcedureIdIdent; @@ -45,6 +46,7 @@ pub struct ProcedureIdent { #[derive(Clone, Debug, PartialEq)] pub struct ProcedureMeta { pub return_types: Vec, + pub arg_names: Vec, pub created_on: DateTime, pub updated_on: DateTime, pub script: String, @@ -56,6 +58,7 @@ impl Default for ProcedureMeta { fn default() -> Self { ProcedureMeta { return_types: vec![], + arg_names: vec![], created_on: Utc::now(), updated_on: Utc::now(), script: 
"".to_string(), @@ -69,8 +72,13 @@ impl Display for ProcedureMeta { fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!( f, - "Lanuage: {:?}, return_type: {:?}, CreatedOn: {:?}, Script: {:?}, Comment: {:?}", - self.procedure_language, self.return_types, self.created_on, self.script, self.comment + "Lanuage: {:?}, args {} return_type: {}, CreatedOn: {:?}, Script: {}, Comment: {:?}", + self.procedure_language, + self.arg_names.display_n::<1000>(), + self.return_types.display_n::<1000>(), + self.created_on, + self.script, + self.comment ) } } diff --git a/src/meta/app/src/schema/database.rs b/src/meta/app/src/schema/database.rs index 04dd518412fc..e5997e5416ad 100644 --- a/src/meta/app/src/schema/database.rs +++ b/src/meta/app/src/schema/database.rs @@ -69,8 +69,29 @@ pub struct DatabaseMeta { pub updated_on: DateTime, pub comment: String, - // if used in CreateDatabaseReq, this field MUST set to None. + /// if used in CreateDatabaseReq, this field MUST set to None. pub drop_on: Option>, + + /// Indicates whether garbage collection is currently in progress for this dropped database. + /// + /// If it is in progress, the database should not be un-dropped, because the data may be incomplete. + /// + /// ```text + /// normal <----. + /// | | + /// | drop() | undrop() + /// v | + /// dropped ----' + /// | + /// | gc() + /// v + /// gc_in_progress=True + /// | + /// | purge data from meta-service + /// v + /// completed removed + /// ``` + pub gc_in_progress: bool, } impl Default for DatabaseMeta { @@ -83,6 +104,7 @@ impl Default for DatabaseMeta { updated_on: Utc::now(), comment: "".to_string(), drop_on: None, + gc_in_progress: false, } } } diff --git a/src/meta/app/src/schema/sequence.rs b/src/meta/app/src/schema/sequence.rs index 6e15eb107c49..2392dd938bdb 100644 --- a/src/meta/app/src/schema/sequence.rs +++ b/src/meta/app/src/schema/sequence.rs @@ -14,13 +14,13 @@ use chrono::DateTime; use chrono::Utc; -use kvapi_impl::Resource; +pub use kvapi_impl::SequenceRsc; use super::CreateOption; use crate::tenant_key::ident::TIdent; /// Defines the meta-service key for sequence. 
-pub type SequenceIdent = TIdent; +pub type SequenceIdent = TIdent; #[derive(Clone, Debug, PartialEq, Eq)] pub struct SequenceMeta { @@ -99,8 +99,8 @@ mod kvapi_impl { use super::SequenceMeta; use crate::tenant_key::resource::TenantResource; - pub struct Resource; - impl TenantResource for Resource { + pub struct SequenceRsc; + impl TenantResource for SequenceRsc { const PREFIX: &'static str = "__fd_sequence"; const HAS_TENANT: bool = true; type ValueType = SequenceMeta; @@ -113,3 +113,34 @@ mod kvapi_impl { } } } + +#[cfg(test)] +mod tests { + use databend_common_meta_kvapi::kvapi::Key; + + use crate::schema::SequenceIdent; + use crate::tenant::Tenant; + + #[test] + fn test_sequence_ident() { + let tenant = Tenant::new_literal("dummy"); + let ident = SequenceIdent::new_generic(tenant, "3".to_string()); + + let key = ident.to_string_key(); + assert_eq!(key, "__fd_sequence/dummy/3"); + + assert_eq!(ident, SequenceIdent::from_str_key(&key).unwrap()); + } + + #[test] + fn test_sequence_ident_with_key_space() { + // TODO(xp): implement this test + // let tenant = Tenant::new_literal("test"); + // let ident = IndexIdIdent::new(tenant, 3); + // + // let key = ident.to_string_key(); + // assert_eq!(key, "__fd_catalog_by_id/3"); + // + // assert_eq!(ident, IndexIdIdent::from_str_key(&key).unwrap()); + } +} diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index b083c0e13535..dab3a40c2a35 100644 --- a/src/meta/app/src/schema/table.rs +++ b/src/meta/app/src/schema/table.rs @@ -961,6 +961,16 @@ pub struct TableCopiedFileNameIdent { pub file: String, } +impl fmt::Display for TableCopiedFileNameIdent { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "TableCopiedFileNameIdent{{table_id:{}, file:{}}}", + self.table_id, self.file + ) + } +} + #[derive(Clone, Debug, Eq, PartialEq, Default)] pub struct TableCopiedFileInfo { pub etag: Option, @@ -984,7 +994,8 @@ pub struct UpsertTableCopiedFileReq { pub file_info: BTreeMap, /// If not None, specifies the time-to-live for the keys. 
pub ttl: Option, - pub fail_if_duplicated: bool, + /// If there is already existing key, ignore inserting + pub insert_if_not_exists: bool, } #[derive(Clone, Debug, PartialEq, Eq)] diff --git a/src/meta/proto-conv/src/database_from_to_protobuf_impl.rs b/src/meta/proto-conv/src/database_from_to_protobuf_impl.rs index f4d533d98aa2..94c186b7c6e6 100644 --- a/src/meta/proto-conv/src/database_from_to_protobuf_impl.rs +++ b/src/meta/proto-conv/src/database_from_to_protobuf_impl.rs @@ -44,6 +44,7 @@ impl FromToProto for mt::DatabaseMeta { Some(drop_on) => Some(DateTime::::from_pb(drop_on)?), None => None, }, + gc_in_progress: p.gc_in_progress, comment: p.comment, }; Ok(v) @@ -62,6 +63,7 @@ impl FromToProto for mt::DatabaseMeta { Some(drop_on) => Some(drop_on.to_pb()?), None => None, }, + gc_in_progress: self.gc_in_progress, comment: self.comment.clone(), shared_by: vec![], from_share: None, diff --git a/src/meta/proto-conv/src/procedure_from_to_protobuf_impl.rs b/src/meta/proto-conv/src/procedure_from_to_protobuf_impl.rs index 4554585a0988..75955c1d2248 100644 --- a/src/meta/proto-conv/src/procedure_from_to_protobuf_impl.rs +++ b/src/meta/proto-conv/src/procedure_from_to_protobuf_impl.rs @@ -70,6 +70,7 @@ impl FromToProto for mt::principal::ProcedureMeta { let v = Self { return_types, + arg_names: p.arg_names.clone(), created_on: DateTime::::from_pb(p.created_on)?, updated_on: DateTime::::from_pb(p.updated_on)?, script: p.script, @@ -94,6 +95,7 @@ impl FromToProto for mt::principal::ProcedureMeta { ver: VER, min_reader_ver: MIN_READER_VER, return_types, + arg_names: self.arg_names.clone(), created_on: self.created_on.to_pb()?, updated_on: self.updated_on.to_pb()?, script: self.script.to_string(), diff --git a/src/meta/proto-conv/src/util.rs b/src/meta/proto-conv/src/util.rs index ca7380806287..b86e0b8a37a6 100644 --- a/src/meta/proto-conv/src/util.rs +++ b/src/meta/proto-conv/src/util.rs @@ -137,7 +137,9 @@ const META_CHANGE_LOG: &[(u64, &str)] = &[ (105, "2024-08-05: Add: add Dictionary meta"), (106, "2024-08-08: Add: add QueryTokenInfo"), (107, "2024-08-09: Add: datatype.proto/DataType Geography type"), - (108, "2024-08-29: Add: procedure.proto: ProcedureMeta and ProcedureIdentity") + (108, "2024-08-29: Add: procedure.proto: ProcedureMeta and ProcedureIdentity"), + (109, "2024-08-29: Refactor: ProcedureMeta add arg_names"), + (110, "2024-09-18: Add: database.proto: DatabaseMeta.gc_in_progress"), // Dear developer: // If you're gonna add a new metadata version, you'll have to add a test for it. 
// You could just copy an existing test file(e.g., `../tests/it/v024_table_meta.rs`) diff --git a/src/meta/proto-conv/tests/it/main.rs b/src/meta/proto-conv/tests/it/main.rs index 0ef441368036..2ab430fb1ad1 100644 --- a/src/meta/proto-conv/tests/it/main.rs +++ b/src/meta/proto-conv/tests/it/main.rs @@ -106,3 +106,5 @@ mod v105_dictionary_meta; mod v106_query_token; mod v107_geography_datatype; mod v108_procedure; +mod v109_procedure_with_args; +mod v110_database_meta_gc_in_progress; diff --git a/src/meta/proto-conv/tests/it/proto_conv.rs b/src/meta/proto-conv/tests/it/proto_conv.rs index 3deb9f42a37d..bbf33928eb39 100644 --- a/src/meta/proto-conv/tests/it/proto_conv.rs +++ b/src/meta/proto-conv/tests/it/proto_conv.rs @@ -52,6 +52,7 @@ fn new_db_meta_share() -> mt::DatabaseMeta { updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), comment: "foo bar".to_string(), drop_on: None, + gc_in_progress: false, } } @@ -64,6 +65,7 @@ fn new_db_meta() -> mt::DatabaseMeta { updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), comment: "foo bar".to_string(), drop_on: None, + gc_in_progress: false, } } diff --git a/src/meta/proto-conv/tests/it/v002_database_meta.rs b/src/meta/proto-conv/tests/it/v002_database_meta.rs index 914b08432430..802f6c5a3e7c 100644 --- a/src/meta/proto-conv/tests/it/v002_database_meta.rs +++ b/src/meta/proto-conv/tests/it/v002_database_meta.rs @@ -48,6 +48,7 @@ fn test_decode_v2_database_meta() -> anyhow::Result<()> { updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), comment: "foo bar".to_string(), drop_on: None, + gc_in_progress: false, }; common::test_pb_from_to(func_name!(), want())?; diff --git a/src/meta/proto-conv/tests/it/v005_database_meta.rs b/src/meta/proto-conv/tests/it/v005_database_meta.rs index 8cc7cb67b5ae..515b67f0dfc8 100644 --- a/src/meta/proto-conv/tests/it/v005_database_meta.rs +++ b/src/meta/proto-conv/tests/it/v005_database_meta.rs @@ -49,6 +49,7 @@ fn test_decode_v5_database_meta() -> anyhow::Result<()> { updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), comment: "foo bar".to_string(), drop_on: None, + gc_in_progress: false, }; common::test_pb_from_to(func_name!(), want())?; diff --git a/src/meta/proto-conv/tests/it/v055_table_meta.rs b/src/meta/proto-conv/tests/it/v055_table_meta.rs index 879d18ade2e4..6d10be03331e 100644 --- a/src/meta/proto-conv/tests/it/v055_table_meta.rs +++ b/src/meta/proto-conv/tests/it/v055_table_meta.rs @@ -119,6 +119,7 @@ fn test_decode_v51_database_meta() -> anyhow::Result<()> { updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), comment: "foo bar".to_string(), drop_on: None, + gc_in_progress: false, }; common::test_pb_from_to(func_name!(), want())?; diff --git a/src/meta/proto-conv/tests/it/v074_table_db_meta.rs b/src/meta/proto-conv/tests/it/v074_table_db_meta.rs index d207108cb274..39a54e2f4cdc 100644 --- a/src/meta/proto-conv/tests/it/v074_table_db_meta.rs +++ b/src/meta/proto-conv/tests/it/v074_table_db_meta.rs @@ -116,6 +116,7 @@ fn test_decode_v74_database_meta() -> anyhow::Result<()> { updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), comment: "foo bar".to_string(), drop_on: None, + gc_in_progress: false, }; common::test_pb_from_to(func_name!(), want())?; diff --git a/src/meta/proto-conv/tests/it/v096_database_meta.rs b/src/meta/proto-conv/tests/it/v096_database_meta.rs index 1ef71b3b6d83..fb9eec2a66e0 100644 --- a/src/meta/proto-conv/tests/it/v096_database_meta.rs +++ b/src/meta/proto-conv/tests/it/v096_database_meta.rs @@ -39,6 
+39,7 @@ fn test_decode_v96_database_meta() -> anyhow::Result<()> { updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), comment: "foo bar".to_string(), drop_on: None, + gc_in_progress: false, }; common::test_pb_from_to(func_name!(), want())?; diff --git a/src/meta/proto-conv/tests/it/v101_database_meta.rs b/src/meta/proto-conv/tests/it/v101_database_meta.rs index 83485bdf8583..ab36a56389b1 100644 --- a/src/meta/proto-conv/tests/it/v101_database_meta.rs +++ b/src/meta/proto-conv/tests/it/v101_database_meta.rs @@ -40,6 +40,7 @@ fn v101_database_meta() -> anyhow::Result<()> { updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), comment: "foo bar".to_string(), drop_on: None, + gc_in_progress: false, }; common::test_pb_from_to(func_name!(), want())?; diff --git a/src/meta/proto-conv/tests/it/v108_procedure.rs b/src/meta/proto-conv/tests/it/v108_procedure.rs index 442397c9146a..f5ea50f002d0 100644 --- a/src/meta/proto-conv/tests/it/v108_procedure.rs +++ b/src/meta/proto-conv/tests/it/v108_procedure.rs @@ -22,15 +22,16 @@ use crate::common; #[test] fn v108_procedure_meta() -> anyhow::Result<()> { - let procedure_meta_v108: Vec = vec![ - 34, 9, 146, 2, 0, 160, 6, 108, 168, 6, 24, 82, 23, 50, 48, 49, 52, 45, 49, 49, 45, 50, 56, + let procedure_meta_v108 = vec![ + 34, 9, 146, 2, 0, 160, 6, 109, 168, 6, 24, 82, 23, 50, 48, 49, 52, 45, 49, 49, 45, 50, 56, 32, 49, 50, 58, 48, 48, 58, 48, 57, 32, 85, 84, 67, 90, 23, 50, 48, 49, 52, 45, 49, 49, 45, 50, 57, 32, 49, 50, 58, 48, 48, 58, 48, 57, 32, 85, 84, 67, 98, 7, 102, 111, 111, 32, 98, - 97, 114, 114, 3, 83, 81, 76, 160, 6, 108, 168, 6, 24, + 97, 114, 114, 3, 83, 81, 76, 160, 6, 109, 168, 6, 24, ]; let want = || mt::ProcedureMeta { return_types: vec![DataType::String], + arg_names: vec![], created_on: Utc.with_ymd_and_hms(2014, 11, 28, 12, 0, 9).unwrap(), updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), script: "".to_string(), @@ -39,7 +40,7 @@ fn v108_procedure_meta() -> anyhow::Result<()> { }; common::test_pb_from_to(func_name!(), want())?; - common::test_load_old(func_name!(), procedure_meta_v108.as_slice(), 108, want()) + common::test_load_old(func_name!(), procedure_meta_v108.as_slice(), 109, want()) } #[test] diff --git a/src/meta/proto-conv/tests/it/v109_procedure_with_args.rs b/src/meta/proto-conv/tests/it/v109_procedure_with_args.rs new file mode 100644 index 000000000000..513e922cfa1b --- /dev/null +++ b/src/meta/proto-conv/tests/it/v109_procedure_with_args.rs @@ -0,0 +1,44 @@ +// Copyright 2023 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
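Aside, a minimal sketch of how a caller would use the renamed `insert_if_not_exists` flag from the `UpsertTableCopiedFileReq` change above. It is not part of the patch; the import path and the generic parameters of `file_info`/`ttl` are elided in this diff, so they are assumptions here.

use std::collections::BTreeMap;

use databend_common_meta_app::schema::TableCopiedFileInfo;
use databend_common_meta_app::schema::UpsertTableCopiedFileReq;

fn build_upsert_req(files: BTreeMap<String, TableCopiedFileInfo>) -> UpsertTableCopiedFileReq {
    UpsertTableCopiedFileReq {
        file_info: files,
        // No expiry for the copied-file keys in this sketch.
        ttl: None,
        // Per the new doc comment: if a key already exists, the insert is
        // skipped and the existing value is kept. The old flag name suggests
        // the request used to fail in that case (assumption from the names).
        insert_if_not_exists: true,
    }
}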
+ +use chrono::TimeZone; +use chrono::Utc; +use databend_common_expression::types::DataType; +use databend_common_meta_app::principal as mt; +use fastrace::func_name; + +use crate::common; + +#[test] +fn v109_procedure_meta() -> anyhow::Result<()> { + let procedure_meta_v109 = vec![ + 34, 9, 146, 2, 0, 160, 6, 109, 168, 6, 24, 42, 7, 109, 101, 115, 115, 97, 103, 101, 82, 23, + 50, 48, 49, 52, 45, 49, 49, 45, 50, 56, 32, 49, 50, 58, 48, 48, 58, 48, 57, 32, 85, 84, 67, + 90, 23, 50, 48, 49, 52, 45, 49, 49, 45, 50, 57, 32, 49, 50, 58, 48, 48, 58, 48, 57, 32, 85, + 84, 67, 98, 7, 102, 111, 111, 32, 98, 97, 114, 114, 3, 83, 81, 76, 160, 6, 109, 168, 6, 24, + ]; + + let want = || mt::ProcedureMeta { + return_types: vec![DataType::String], + arg_names: vec!["message".to_string()], + created_on: Utc.with_ymd_and_hms(2014, 11, 28, 12, 0, 9).unwrap(), + updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), + script: "".to_string(), + comment: "foo bar".to_string(), + procedure_language: "SQL".to_string(), + }; + + common::test_pb_from_to(func_name!(), want())?; + common::test_load_old(func_name!(), procedure_meta_v109.as_slice(), 109, want()) +} diff --git a/src/meta/proto-conv/tests/it/v110_database_meta_gc_in_progress.rs b/src/meta/proto-conv/tests/it/v110_database_meta_gc_in_progress.rs new file mode 100644 index 000000000000..23901d89b68a --- /dev/null +++ b/src/meta/proto-conv/tests/it/v110_database_meta_gc_in_progress.rs @@ -0,0 +1,60 @@ +// Copyright 2023 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use chrono::TimeZone; +use chrono::Utc; +use databend_common_meta_app::schema as mt; +use fastrace::func_name; +use maplit::btreemap; + +use crate::common; + +// These bytes are built when a new version in introduced, +// and are kept for backward compatibility test. +// +// ************************************************************* +// * These messages should never be updated, * +// * only be added when a new version is added, * +// * or be removed when an old version is no longer supported. * +// ************************************************************* +// +// The message bytes are built from the output of `test_pb_from_to()` +#[test] +fn test_decode_v110_database_meta() -> anyhow::Result<()> { + let database_meta_v110 = vec![ + 34, 10, 10, 3, 120, 121, 122, 18, 3, 102, 111, 111, 42, 2, 52, 52, 50, 10, 10, 3, 97, 98, + 99, 18, 3, 100, 101, 102, 162, 1, 23, 50, 48, 49, 52, 45, 49, 49, 45, 50, 56, 32, 49, 50, + 58, 48, 48, 58, 48, 57, 32, 85, 84, 67, 170, 1, 23, 50, 48, 49, 52, 45, 49, 49, 45, 50, 57, + 32, 49, 50, 58, 48, 48, 58, 48, 57, 32, 85, 84, 67, 178, 1, 7, 102, 111, 111, 32, 98, 97, + 114, 232, 1, 1, 160, 6, 110, 168, 6, 24, + ]; + + let want = || mt::DatabaseMeta { + engine: "44".to_string(), + engine_options: btreemap! {s("abc") => s("def")}, + options: btreemap! 
{s("xyz") => s("foo")}, + created_on: Utc.with_ymd_and_hms(2014, 11, 28, 12, 0, 9).unwrap(), + updated_on: Utc.with_ymd_and_hms(2014, 11, 29, 12, 0, 9).unwrap(), + comment: "foo bar".to_string(), + drop_on: None, + gc_in_progress: true, + }; + + common::test_pb_from_to(func_name!(), want())?; + common::test_load_old(func_name!(), database_meta_v110.as_slice(), 110, want()) +} + +fn s(ss: impl ToString) -> String { + ss.to_string() +} diff --git a/src/meta/protos/proto/database.proto b/src/meta/protos/proto/database.proto index ff47219fb813..1af7ee9525d0 100644 --- a/src/meta/protos/proto/database.proto +++ b/src/meta/protos/proto/database.proto @@ -70,6 +70,11 @@ message DatabaseMeta { // The time table dropped. optional string drop_on = 23; + // Indicates whether garbage collection is currently in progress for this dropped database. + // + // If it is in progress, the database should not be un-dropped, because the data may be incomplete. + bool gc_in_progress = 29; + repeated uint64 shared_by = 24; optional TIdent from_share = 25; diff --git a/src/meta/protos/proto/procedure.proto b/src/meta/protos/proto/procedure.proto index 6824c88d18c9..35f6145ab253 100644 --- a/src/meta/protos/proto/procedure.proto +++ b/src/meta/protos/proto/procedure.proto @@ -37,6 +37,7 @@ message ProcedureMeta { // Procedure return type repeated DataType return_types = 4; + repeated string arg_names = 5; // The time database created. string created_on = 10; diff --git a/src/query/ast/src/ast/statements/procedure.rs b/src/query/ast/src/ast/statements/procedure.rs index 1ebf8e82ab7a..336649120ed4 100644 --- a/src/query/ast/src/ast/statements/procedure.rs +++ b/src/query/ast/src/ast/statements/procedure.rs @@ -19,8 +19,8 @@ use derive_visitor::Drive; use derive_visitor::DriveMut; use crate::ast::write_comma_separated_list; -use crate::ast::write_comma_separated_string_list; use crate::ast::CreateOption; +use crate::ast::Expr; use crate::ast::TypeName; #[derive(Debug, Clone, PartialEq, Eq, Drive, DriveMut)] @@ -169,16 +169,17 @@ impl Display for DescProcedureStmt { } } -#[derive(Debug, Clone, PartialEq, Eq, Drive, DriveMut)] +#[derive(Debug, Clone, PartialEq, Drive, DriveMut)] pub struct CallProcedureStmt { pub name: String, - pub args: Vec, + pub args: Vec, } impl Display for CallProcedureStmt { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "CALL PROCEDURE {}(", self.name)?; - write_comma_separated_string_list(f, self.args.clone())?; + let CallProcedureStmt { name, args } = self; + write!(f, "CALL PROCEDURE {}(", name)?; + write_comma_separated_list(f, args)?; write!(f, ")")?; Ok(()) } diff --git a/src/query/ast/src/parser/statement.rs b/src/query/ast/src/parser/statement.rs index b4f9c5bf61ce..428ef7ecf766 100644 --- a/src/query/ast/src/parser/statement.rs +++ b/src/query/ast/src/parser/statement.rs @@ -2125,12 +2125,12 @@ pub fn statement_body(i: Input) -> IResult { let call_procedure = map( rule! { - CALL ~ PROCEDURE ~ #ident ~ "(" ~ ")" + CALL ~ PROCEDURE ~ #ident ~ "(" ~ #comma_separated_list0(subexpr(0))? 
~ ")" }, - |(_, _, name, _, _)| { + |(_, _, name, _, opt_args, _)| { Statement::CallProcedure(CallProcedureStmt { name: name.to_string(), - args: vec![], + args: opt_args.unwrap_or_default(), }) }, ); diff --git a/src/query/ast/tests/it/parser.rs b/src/query/ast/tests/it/parser.rs index c0be807a150b..005a68fb413e 100644 --- a/src/query/ast/tests/it/parser.rs +++ b/src/query/ast/tests/it/parser.rs @@ -831,6 +831,7 @@ fn test_statement() { r#"drop PROCEDURE p1()"#, r#"drop PROCEDURE p1(int, string)"#, r#"call PROCEDURE p1()"#, + r#"call PROCEDURE p1(1, 'x', '2022-02-02'::Date)"#, r#"show PROCEDURES like 'p1%'"#, r#"create PROCEDURE p1() returns string not null language sql comment = 'test' as $$ BEGIN diff --git a/src/query/ast/tests/it/testdata/stmt.txt b/src/query/ast/tests/it/testdata/stmt.txt index 52e504ccf9ea..25e029ec6d15 100644 --- a/src/query/ast/tests/it/testdata/stmt.txt +++ b/src/query/ast/tests/it/testdata/stmt.txt @@ -23066,6 +23066,51 @@ CallProcedure( ) +---------- Input ---------- +call PROCEDURE p1(1, 'x', '2022-02-02'::Date) +---------- Output --------- +CALL PROCEDURE p1(1, 'x', '2022-02-02'::DATE) +---------- AST ------------ +CallProcedure( + CallProcedureStmt { + name: "p1", + args: [ + Literal { + span: Some( + 18..19, + ), + value: UInt64( + 1, + ), + }, + Literal { + span: Some( + 21..24, + ), + value: String( + "x", + ), + }, + Cast { + span: Some( + 38..44, + ), + expr: Literal { + span: Some( + 26..38, + ), + value: String( + "2022-02-02", + ), + }, + target_type: Date, + pg_style: true, + }, + ], + }, +) + + ---------- Input ---------- show PROCEDURES like 'p1%' ---------- Output --------- diff --git a/src/query/ee/tests/it/inverted_index/index_refresh.rs b/src/query/ee/tests/it/inverted_index/index_refresh.rs index c761b8292b2a..611e1ff11181 100644 --- a/src/query/ee/tests/it/inverted_index/index_refresh.rs +++ b/src/query/ee/tests/it/inverted_index/index_refresh.rs @@ -13,6 +13,7 @@ // limitations under the License. 
use std::collections::BTreeMap; +use std::collections::HashSet; use databend_common_base::base::tokio; use databend_common_catalog::plan::InvertedIndexInfo; @@ -38,6 +39,7 @@ use databend_query::interpreters::RefreshTableIndexInterpreter; use databend_query::test_kits::append_string_sample_data; use databend_query::test_kits::*; use databend_storages_common_cache::LoadParams; +use tantivy::schema::IndexRecordOption; #[tokio::test(flavor = "multi_thread")] async fn test_fuse_do_refresh_inverted_index() -> Result<()> { @@ -144,14 +146,17 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { let field_nums = query_fields.len(); let has_score = true; let need_position = false; + let mut field_ids = HashSet::new(); + field_ids.insert(0); + field_ids.insert(1); + let index_record = IndexRecordOption::WithFreqsAndPositions; - let index_reader = - InvertedIndexReader::try_create(dal.clone(), field_nums, need_position, &index_loc).await?; + let index_reader = InvertedIndexReader::create(dal.clone()); let queries = vec![ ("rust".to_string(), vec![0, 1]), ("java".to_string(), vec![2]), - ("data".to_string(), vec![4, 1, 5]), + ("data".to_string(), vec![1, 4, 5]), ]; for (query_text, ids) in queries.into_iter() { @@ -166,14 +171,24 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { inverted_index_option: None, }; - let (query, tokenizer_manager) = create_inverted_index_query(&inverted_index_info)?; - - let matched_rows = index_reader.clone().do_filter( - has_score, - &query, - tokenizer_manager, - block_meta.row_count, - )?; + let (query, fuzziness, tokenizer_manager) = + create_inverted_index_query(&inverted_index_info)?; + + let matched_rows = index_reader + .clone() + .do_filter( + field_nums, + need_position, + has_score, + query.box_clone(), + &field_ids, + &index_record, + &fuzziness, + tokenizer_manager, + block_meta.row_count as u32, + &index_loc, + ) + .await?; assert!(matched_rows.is_some()); let matched_rows = matched_rows.unwrap(); assert_eq!(matched_rows.len(), ids.len()); diff --git a/src/query/ee/tests/it/inverted_index/pruning.rs b/src/query/ee/tests/it/inverted_index/pruning.rs index ea9009584722..504fe405fe06 100644 --- a/src/query/ee/tests/it/inverted_index/pruning.rs +++ b/src/query/ee/tests/it/inverted_index/pruning.rs @@ -707,7 +707,7 @@ async fn test_block_pruner() -> Result<()> { }), ..Default::default() }; - let e13 = PushDownInfo { + let _e13 = PushDownInfo { inverted_index: Some(InvertedIndexInfo { index_name: index_name.clone(), index_version: index_version.clone(), @@ -720,7 +720,7 @@ async fn test_block_pruner() -> Result<()> { }), ..Default::default() }; - let e14 = PushDownInfo { + let _e14 = PushDownInfo { inverted_index: Some(InvertedIndexInfo { index_name: index_name.clone(), index_version: index_version.clone(), @@ -733,7 +733,7 @@ async fn test_block_pruner() -> Result<()> { }), ..Default::default() }; - let e15 = PushDownInfo { + let _e15 = PushDownInfo { inverted_index: Some(InvertedIndexInfo { index_name: index_name.clone(), index_version: index_version.clone(), @@ -759,9 +759,9 @@ async fn test_block_pruner() -> Result<()> { (Some(e10), 2, 2), (Some(e11), 9, 15), (Some(e12), 2, 2), - (Some(e13), 3, 3), - (Some(e14), 2, 2), - (Some(e15), 2, 5), + //(Some(e13), 3, 3), + //(Some(e14), 2, 2), + //(Some(e15), 2, 5), ]; for (extra, expected_blocks, expected_rows) in extras { diff --git a/src/query/expression/src/kernels/mod.rs b/src/query/expression/src/kernels/mod.rs index c6485fb21310..1e5369e2d807 100644 --- 
a/src/query/expression/src/kernels/mod.rs +++ b/src/query/expression/src/kernels/mod.rs @@ -18,6 +18,7 @@ mod group_by; mod group_by_hash; mod scatter; mod sort; +mod sort_compare; mod take; mod take_chunks; mod take_compact; @@ -27,6 +28,7 @@ mod utils; pub use group_by_hash::*; pub use sort::*; +pub use sort_compare::*; pub use take_chunks::*; pub use topk::*; pub use utils::*; diff --git a/src/query/expression/src/kernels/sort.rs b/src/query/expression/src/kernels/sort.rs index 7e49e28e358b..b4cb346a4917 100644 --- a/src/query/expression/src/kernels/sort.rs +++ b/src/query/expression/src/kernels/sort.rs @@ -13,20 +13,16 @@ // limitations under the License. use std::cmp::Ordering; -use std::iter::once; use std::sync::Arc; use databend_common_arrow::arrow::array::ord as arrow_ord; use databend_common_arrow::arrow::array::ord::DynComparator; use databend_common_arrow::arrow::array::Array; use databend_common_arrow::arrow::array::PrimitiveArray; -use databend_common_arrow::arrow::compute::merge_sort as arrow_merge_sort; -use databend_common_arrow::arrow::compute::merge_sort::build_comparator_impl; use databend_common_arrow::arrow::compute::sort as arrow_sort; use databend_common_arrow::arrow::datatypes::DataType as ArrowType; use databend_common_arrow::arrow::error::Error as ArrowError; use databend_common_arrow::arrow::error::Result as ArrowResult; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use crate::converts::arrow2::ARROW_EXT_TYPE_EMPTY_ARRAY; @@ -34,10 +30,13 @@ use crate::converts::arrow2::ARROW_EXT_TYPE_EMPTY_MAP; use crate::converts::arrow2::ARROW_EXT_TYPE_VARIANT; use crate::types::DataType; use crate::utils::arrow::column_to_arrow_array; +use crate::visitor::ValueVisitor; use crate::Column; use crate::ColumnBuilder; use crate::DataBlock; use crate::Scalar; +use crate::SortCompare; +use crate::Value; pub type AbortChecker = Arc; @@ -54,6 +53,28 @@ pub struct SortColumnDescription { pub is_nullable: bool, } +#[derive(Copy, Clone, Debug)] +pub enum LimitType { + None, + LimitRows(usize), + LimitRank(usize), +} + +impl LimitType { + pub fn from_limit_rows(limit: Option) -> Self { + match limit { + Some(limit) => LimitType::LimitRows(limit), + None => LimitType::None, + } + } + pub fn limit_rows(&self, rows: usize) -> usize { + match self { + LimitType::LimitRows(limit) => *limit, + _ => rows, + } + } +} + #[derive(Clone, Debug)] pub struct SortField { pub data_type: DataType, @@ -80,6 +101,43 @@ impl DataBlock { block: &DataBlock, descriptions: &[SortColumnDescription], limit: Option, + ) -> Result { + let limit = if let Some(l) = limit { + LimitType::LimitRows(l) + } else { + LimitType::None + }; + + Self::sort_with_type(block, descriptions, limit) + } + + pub fn sort_with_type( + block: &DataBlock, + descriptions: &[SortColumnDescription], + limit: LimitType, + ) -> Result { + let num_rows = block.num_rows(); + if num_rows <= 1 || block.num_columns() == 0 { + return Ok(block.clone()); + } + let mut sort_compare = SortCompare::new(descriptions.to_owned(), num_rows, limit); + + for desc in descriptions.iter() { + let array = block.get_by_offset(desc.offset).value.clone(); + sort_compare.visit_value(array)?; + sort_compare.increment_column_index(); + } + + let permutations = sort_compare.take_permutation(); + DataBlock::take(block, &permutations, &mut None) + } + + // TODO remove these + #[allow(dead_code)] + pub fn sort_old( + block: &DataBlock, + descriptions: &[SortColumnDescription], + limit: Option, ) -> Result { let num_rows = 
block.num_rows(); if num_rows <= 1 { @@ -106,115 +164,6 @@ impl DataBlock { arrow_sort::lexsort_to_indices_impl(&order_arrays, limit, &build_compare)?; DataBlock::take(block, indices.values(), &mut None) } - - // merge two blocks to one sorted block - // require: lhs and rhs have been `convert_to_full`. - fn two_way_merge_sort( - blocks: &[DataBlock], - descriptions: &[SortColumnDescription], - limit: Option, - ) -> Result { - assert!(blocks.len() == 2); - - let lhs = &blocks[0]; - let rhs = &blocks[1]; - - let lhs_len = lhs.num_rows(); - let rhs_len = rhs.num_rows(); - if lhs_len == 0 { - return Ok(rhs.clone()); - } - if rhs_len == 0 { - return Ok(lhs.clone()); - } - - let mut sort_options = Vec::with_capacity(descriptions.len()); - let sort_arrays = descriptions - .iter() - .map(|d| { - let left = column_to_arrow_array(lhs.get_by_offset(d.offset), lhs_len); - let right = column_to_arrow_array(rhs.get_by_offset(d.offset), rhs_len); - sort_options.push(arrow_sort::SortOptions { - descending: !d.asc, - nulls_first: d.nulls_first, - }); - vec![left, right] - }) - .collect::>(); - - let sort_dyn_arrays = sort_arrays - .iter() - .map(|f| vec![f[0].as_ref(), f[1].as_ref()]) - .collect::>(); - - let sort_options_with_arrays = sort_dyn_arrays - .iter() - .zip(sort_options.iter()) - .map(|(arrays, opt)| (arrays as &[&dyn Array], opt)) - .collect::>(); - - let comparator = build_comparator_impl(&sort_options_with_arrays, &build_compare)?; - let lhs_slice = (0, 0, lhs_len); - let rhs_slice = (1, 0, rhs_len); - - let slices = - arrow_merge_sort::merge_sort_slices(once(&lhs_slice), once(&rhs_slice), &comparator) - .to_vec(limit); - - let block = DataBlock::take_by_slices_limit_from_blocks(blocks, &slices, limit); - Ok(block) - } - - pub fn merge_sort( - blocks: &[DataBlock], - descriptions: &[SortColumnDescription], - limit: Option, - abort_checker: AbortChecker, - ) -> Result { - match blocks.len() { - 0 => Result::Err(ErrorCode::EmptyData("Can't merge empty blocks")), - 1 => Ok(blocks[0].clone()), - 2 => { - if abort_checker.is_aborting() { - return Err(ErrorCode::AbortedQuery( - "Aborted query, because the server is shutting down or the query was killed.", - )); - } - - DataBlock::two_way_merge_sort(blocks, descriptions, limit) - } - _ => { - if abort_checker.is_aborting() { - return Err(ErrorCode::AbortedQuery( - "Aborted query, because the server is shutting down or the query was killed.", - )); - } - let left = DataBlock::merge_sort( - &blocks[0..blocks.len() / 2], - descriptions, - limit, - abort_checker.clone(), - )?; - if abort_checker.is_aborting() { - return Err(ErrorCode::AbortedQuery( - "Aborted query, because the server is shutting down or the query was killed.", - )); - } - let right = DataBlock::merge_sort( - &blocks[blocks.len() / 2..blocks.len()], - descriptions, - limit, - abort_checker.clone(), - )?; - if abort_checker.is_aborting() { - return Err(ErrorCode::AbortedQuery( - "Aborted query, because the server is shutting down or the query was killed.", - )); - } - DataBlock::two_way_merge_sort(&[left, right], descriptions, limit) - } - } - } } fn compare_variant(left: &dyn Array, right: &dyn Array) -> ArrowResult { @@ -290,20 +239,26 @@ pub fn compare_scalars(rows: Vec>, data_types: &[DataType]) -> Resul let order_columns = columns .into_iter() - .map(|builder| builder.build().as_arrow()) + .map(|builder| builder.build()) .collect::>(); - let order_arrays = order_columns + + let descriptions = order_columns .iter() - .map(|array| arrow_sort::SortColumn { - values: array.as_ref(), - 
options: Some(arrow_sort::SortOptions { - descending: false, - nulls_first: false, - }), + .enumerate() + .map(|(idx, array)| SortColumnDescription { + offset: idx, + asc: true, + nulls_first: false, + is_nullable: array.data_type().is_nullable(), }) .collect::>(); - let indices: PrimitiveArray = - arrow_sort::lexsort_to_indices_impl(&order_arrays, None, &build_compare)?; - Ok(indices.values().to_vec()) + let mut sort_compare = SortCompare::new(descriptions, length, LimitType::None); + + for array in order_columns { + sort_compare.visit_value(Value::Column(array))?; + sort_compare.increment_column_index(); + } + + Ok(sort_compare.take_permutation()) } diff --git a/src/query/expression/src/kernels/sort_compare.rs b/src/query/expression/src/kernels/sort_compare.rs new file mode 100644 index 000000000000..f492ba7178c9 --- /dev/null +++ b/src/query/expression/src/kernels/sort_compare.rs @@ -0,0 +1,329 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Ordering; +use std::ops::Range; + +use databend_common_arrow::arrow::bitmap::Bitmap; +use databend_common_arrow::arrow::buffer::Buffer; +use databend_common_exception::Result; +use memchr::memchr; + +use crate::types::AnyType; +use crate::types::NullableColumn; +use crate::types::Number; +use crate::types::ValueType; +use crate::visitor::ValueVisitor; +use crate::LimitType; +use crate::SortColumnDescription; + +pub struct SortCompare { + rows: usize, + limit: LimitType, + permutation: Vec, + ordering_descs: Vec, + current_column_index: usize, + validity: Option, + equality_index: Vec, +} + +macro_rules! 
do_sorter { + ($self: expr, $value:expr, $validity:expr, $g:expr, $c:expr, $ordering_desc:expr, $range: expr) => { + if let Some(valids) = &$validity { + $self.do_inner_sort( + |&a, &b| match (valids.get_bit(a as _), valids.get_bit(b as _)) { + (true, true) => { + let left = $g($value, a); + let right = $g($value, b); + + if $ordering_desc.asc { + $c(left, right) + } else { + $c(right, left) + } + } + (true, false) => { + if $ordering_desc.nulls_first { + Ordering::Greater + } else { + Ordering::Less + } + } + (false, true) => { + if $ordering_desc.nulls_first { + Ordering::Less + } else { + Ordering::Greater + } + } + (false, false) => Ordering::Equal, + }, + $range, + ); + } else { + if $ordering_desc.asc { + $self.do_inner_sort( + |&a, &b| { + let left = $g($value, a); + let right = $g($value, b); + $c(left, right) + }, + $range, + ); + } else { + $self.do_inner_sort( + |&a, &b| { + let left = $g($value, a); + let right = $g($value, b); + $c(right, left) + }, + $range, + ); + } + } + }; +} + +impl SortCompare { + pub fn new(ordering_descs: Vec, rows: usize, limit: LimitType) -> Self { + let equality_index = + if ordering_descs.len() == 1 && !matches!(limit, LimitType::LimitRank(_)) { + vec![] + } else { + vec![1; rows as _] + }; + Self { + rows, + limit, + permutation: (0..rows as u32).collect(), + ordering_descs, + current_column_index: 0, + validity: None, + equality_index, + } + } + + fn need_update_equality_index(&self) -> bool { + self.current_column_index != self.ordering_descs.len() - 1 + || matches!(self.limit, LimitType::LimitRank(_)) + } + + pub fn increment_column_index(&mut self) { + self.current_column_index += 1; + } + + pub fn take_permutation(mut self) -> Vec { + match self.limit { + LimitType::None => self.permutation, + LimitType::LimitRows(rows) => { + self.permutation.truncate(rows); + self.permutation + } + LimitType::LimitRank(rank_number) => { + let mut unique_count = 0; + + let mut start = 0; + // the index of last zero sign + let mut zero_index: isize = -1; + while start < self.rows { + // Find the first occurrence of 1 in the equality_index using memchr + if let Some(pos) = memchr(1, &self.equality_index[start..self.rows]) { + start += pos; + } else { + start = self.rows; + } + unique_count += (start as isize - zero_index) as usize; + + if unique_count > rank_number { + start -= unique_count - rank_number; + break; + } + + if start == self.rows { + break; + } + + // Find the first occurrence of 0 after the start position using memchr + if let Some(pos) = memchr(0, &self.equality_index[start..self.rows]) { + start += pos; + } else { + start = self.rows; + } + if unique_count == rank_number { + break; + } + zero_index = start as _; + } + + self.permutation.truncate(start); + self.permutation + } + } + } + + fn do_inner_sort(&mut self, c: C, range: Range) + where C: FnMut(&u32, &u32) -> Ordering + Copy { + let permutations = &mut self.permutation[range.start..range.end]; + + let limit = self.limit.limit_rows(self.rows); + if limit > range.start && limit < range.end { + let (p, _, _) = permutations.select_nth_unstable_by(limit - range.start, c); + p.sort_unstable_by(c); + } else { + permutations.sort_unstable_by(c); + } + } + + // sort the value using generic G and C + fn generic_sort(&mut self, value: V, g: G, c: C) + where + G: Fn(V, u32) -> T + Copy, + V: Copy, + C: Fn(T, T) -> Ordering + Copy, + { + let validity = self.validity.take(); + let ordering_desc = self.ordering_descs[self.current_column_index].clone(); + + // faster path for only one sort column + if 
self.ordering_descs.len() == 1 { + do_sorter!(self, value, validity, g, c, ordering_desc, 0..self.rows); + } else { + let mut current = 1; + let len = self.rows; + let need_update_equality_index = self.need_update_equality_index(); + + while current < len { + // Find the start of the next range of equal elements + let start = if let Some(pos) = memchr(1, &self.equality_index[current..len]) { + current + pos + } else { + len + }; + + if start == len { + break; + } + + // Find the end of the range of equal elements + let end = if let Some(pos) = memchr(0, &self.equality_index[start..len]) { + start + pos + } else { + len + }; + + let range = start - 1..end; + // Perform the inner sort on the found range + do_sorter!(self, value, validity, g, c, ordering_desc, range); + if need_update_equality_index { + // Update equality_index + for i in start..end { + let is_equal = if let Some(ref v) = validity { + let va = v.get_bit(self.permutation[i] as _); + let vb = v.get_bit(self.permutation[i - 1] as _); + if va && vb { + c( + g(value, self.permutation[i]), + g(value, self.permutation[i - 1]), + ) == Ordering::Equal + } else { + !va && !vb + } + } else { + c( + g(value, self.permutation[i]), + g(value, self.permutation[i - 1]), + ) == Ordering::Equal + }; + self.equality_index[i] &= u8::from(is_equal); + } + } + + current = end; + } + } + } +} + +impl ValueVisitor for SortCompare { + fn visit_scalar(&mut self, _scalar: crate::Scalar) -> Result<()> { + Ok(()) + } + + // faster path for numeric + fn visit_number(&mut self, column: Buffer) -> Result<()> { + let values = column.as_slice(); + self.generic_sort(values, |c, idx| c[idx as usize], |a: T, b: T| a.cmp(&b)); + Ok(()) + } + + fn visit_timestamp(&mut self, buffer: Buffer) -> Result<()> { + self.visit_number(buffer) + } + + fn visit_date(&mut self, buffer: Buffer) -> Result<()> { + self.visit_number(buffer) + } + + fn visit_typed_column(&mut self, col: T::Column) -> Result<()> { + self.generic_sort( + &col, + |c, idx| -> T::ScalarRef<'_> { unsafe { T::index_column_unchecked(c, idx as _) } }, + |a, b| T::compare(a, b), + ); + Ok(()) + } + + fn visit_nullable(&mut self, column: Box>) -> Result<()> { + if column.validity.unset_bits() > 0 { + self.validity = Some(column.validity.clone()); + } + self.visit_column(column.column.clone()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_take_permutation() { + let test_cases1 = vec![ + (12, LimitType::None, 0..12), + (12, LimitType::LimitRows(5), 0..5), + ]; + + let test_cases2 = vec![ + (12, LimitType::LimitRank(5), 0..11), + (12, LimitType::LimitRank(3), 0..6), + (12, LimitType::LimitRank(4), 0..7), + (12, LimitType::LimitRank(5), 0..11), + ]; + + for (c, limit, range) in test_cases1 { + let sort_compare = SortCompare::new(vec![], c, limit); + + let permutation = sort_compare.take_permutation(); + let result: Vec = range.map(|c| c as u32).collect(); + assert_eq!(permutation, result); + } + + for (c, limit, range) in test_cases2 { + let mut sort_compare = SortCompare::new(vec![], c, limit); + sort_compare.equality_index = vec![1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0]; + let permutation = sort_compare.take_permutation(); + let result: Vec = range.map(|c| c as u32).collect(); + assert_eq!(permutation, result); + } + } +} diff --git a/src/query/expression/src/types.rs b/src/query/expression/src/types.rs index b5d69f85737b..aa6b837bfa7d 100755 --- a/src/query/expression/src/types.rs +++ b/src/query/expression/src/types.rs @@ -44,22 +44,26 @@ use serde::Deserialize; use serde::Serialize; 
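A usage sketch for the new `DataBlock::sort_with_type` entry point with an explicit `LimitType`, as defined above. It is not part of the patch, and the crate-root re-exports of `LimitType` and `SortColumnDescription` are assumed.

use databend_common_exception::Result;
use databend_common_expression::DataBlock;
use databend_common_expression::LimitType;
use databend_common_expression::SortColumnDescription;

/// Sort ascending by the first column and keep only the first three rows.
fn top3_by_first_column(block: &DataBlock) -> Result<DataBlock> {
    let descriptions = vec![SortColumnDescription {
        offset: 0,
        asc: true,
        nulls_first: false,
        is_nullable: false,
    }];
    // LimitType::LimitRows(3) truncates the permutation to three rows, while
    // LimitType::LimitRank(3) would keep every row whose sort key falls within
    // the first three distinct key values (see take_permutation above).
    DataBlock::sort_with_type(block, &descriptions, LimitType::LimitRows(3))
}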
pub use self::any::AnyType; +pub use self::array::ArrayColumn; pub use self::array::ArrayType; +pub use self::binary::BinaryColumn; pub use self::binary::BinaryType; pub use self::bitmap::BitmapType; pub use self::boolean::BooleanType; pub use self::date::DateType; -pub use self::decimal::DecimalDataType; -pub use self::decimal::DecimalSize; +pub use self::decimal::*; pub use self::empty_array::EmptyArrayType; pub use self::empty_map::EmptyMapType; pub use self::generic::GenericType; +pub use self::geography::GeographyColumn; pub use self::geography::GeographyType; pub use self::map::MapType; pub use self::null::NullType; +pub use self::nullable::NullableColumn; pub use self::nullable::NullableType; pub use self::number::*; pub use self::number_class::*; +pub use self::string::StringColumn; pub use self::string::StringType; pub use self::timestamp::TimestampType; pub use self::variant::VariantType; @@ -387,46 +391,47 @@ pub trait ValueType: Debug + Clone + PartialEq + Sized + 'static { Self::column_len(col) * std::mem::size_of::() } - /// Compare two scalars and return the Ordering between them, some data types not support comparison. + /// This is default implementation yet it's not efficient. #[inline(always)] - fn compare(_: Self::ScalarRef<'_>, _: Self::ScalarRef<'_>) -> Option { - None + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + Self::upcast_scalar(Self::to_owned_scalar(lhs)) + .cmp(&Self::upcast_scalar(Self::to_owned_scalar(rhs))) } /// Equal comparison between two scalars, some data types not support comparison. #[inline(always)] fn equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { - matches!(Self::compare(left, right), Some(Ordering::Equal)) + matches!(Self::compare(left, right), Ordering::Equal) } /// Not equal comparison between two scalars, some data types not support comparison. #[inline(always)] fn not_equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { - !matches!(Self::compare(left, right), Some(Ordering::Equal)) + !matches!(Self::compare(left, right), Ordering::Equal) } /// Greater than comparison between two scalars, some data types not support comparison. #[inline(always)] fn greater_than(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { - matches!(Self::compare(left, right), Some(Ordering::Greater)) + matches!(Self::compare(left, right), Ordering::Greater) } /// Less than comparison between two scalars, some data types not support comparison. #[inline(always)] fn less_than(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { - matches!(Self::compare(left, right), Some(Ordering::Less)) + matches!(Self::compare(left, right), Ordering::Less) } /// Greater than or equal comparison between two scalars, some data types not support comparison. #[inline(always)] fn greater_than_equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { - !matches!(Self::compare(left, right), Some(Ordering::Less)) + !matches!(Self::compare(left, right), Ordering::Less) } /// Less than or equal comparison between two scalars, some data types not support comparison. 
#[inline(always)] fn less_than_equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { - !matches!(Self::compare(left, right), Some(Ordering::Greater)) + !matches!(Self::compare(left, right), Ordering::Greater) } } diff --git a/src/query/expression/src/types/any.rs b/src/query/expression/src/types/any.rs index 283d4e808cfb..b0ed227866ca 100755 --- a/src/query/expression/src/types/any.rs +++ b/src/query/expression/src/types/any.rs @@ -149,7 +149,7 @@ impl ValueType for AnyType { } #[inline(always)] - fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Option { - Some(lhs.cmp(&rhs)) + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(&rhs) } } diff --git a/src/query/expression/src/types/binary.rs b/src/query/expression/src/types/binary.rs index 8cbc2b83c225..7bd855a0ff34 100644 --- a/src/query/expression/src/types/binary.rs +++ b/src/query/expression/src/types/binary.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::iter::once; use std::marker::PhantomData; use std::ops::Range; @@ -168,6 +169,11 @@ impl ValueType for BinaryType { fn column_memory_size(col: &Self::Column) -> usize { col.data().len() + col.offsets().len() * 8 } + + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(rhs) + } } impl ArgType for BinaryType { diff --git a/src/query/expression/src/types/bitmap.rs b/src/query/expression/src/types/bitmap.rs index 9ff7bd013c5c..1823941ba2b7 100644 --- a/src/query/expression/src/types/bitmap.rs +++ b/src/query/expression/src/types/bitmap.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::ops::Range; use super::binary::BinaryColumn; @@ -162,6 +163,11 @@ impl ValueType for BitmapType { fn column_memory_size(col: &Self::Column) -> usize { col.data().len() + col.offsets().len() * 8 } + + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(rhs) + } } impl ArgType for BitmapType { diff --git a/src/query/expression/src/types/boolean.rs b/src/query/expression/src/types/boolean.rs index 63bbdf8b8a84..ed579f263fc9 100644 --- a/src/query/expression/src/types/boolean.rs +++ b/src/query/expression/src/types/boolean.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::ops::Range; use databend_common_arrow::arrow::bitmap::Bitmap; @@ -164,6 +165,11 @@ impl ValueType for BooleanType { builder.get(0) } + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(&rhs) + } + #[inline(always)] fn equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { left == right diff --git a/src/query/expression/src/types/date.rs b/src/query/expression/src/types/date.rs index dc7b8d6611f0..4e171a329bca 100644 --- a/src/query/expression/src/types/date.rs +++ b/src/query/expression/src/types/date.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
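Because `ValueType::compare` now returns a total `Ordering` instead of `Option<Ordering>`, generic callers no longer need a "not comparable" branch. A minimal illustration, not part of the patch, using only methods already on the trait:

use std::cmp::Ordering;

use databend_common_expression::types::ValueType;

/// Returns true if the column is sorted ascending under `T::compare`.
fn column_is_sorted_asc<T: ValueType>(col: &T::Column) -> bool {
    let len = T::column_len(col);
    (1..len).all(|i| {
        // Safety: both indexes are within 0..len.
        let prev = unsafe { T::index_column_unchecked(col, i - 1) };
        let cur = unsafe { T::index_column_unchecked(col, i) };
        T::compare(prev, cur) != Ordering::Greater
    })
}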
+use std::cmp::Ordering; use std::fmt::Display; use std::io::Cursor; use std::ops::Range; @@ -185,6 +186,11 @@ impl ValueType for DateType { builder[0] } + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(&rhs) + } + #[inline(always)] fn equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { left == right diff --git a/src/query/expression/src/types/decimal.rs b/src/query/expression/src/types/decimal.rs index 99ada93822e5..0271d2d7fcc8 100644 --- a/src/query/expression/src/types/decimal.rs +++ b/src/query/expression/src/types/decimal.rs @@ -181,6 +181,11 @@ impl ValueType for DecimalType { builder[0] } + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(&rhs) + } + #[inline(always)] fn equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { left == right diff --git a/src/query/expression/src/types/empty_array.rs b/src/query/expression/src/types/empty_array.rs index 7cb868244058..15818aaa9742 100644 --- a/src/query/expression/src/types/empty_array.rs +++ b/src/query/expression/src/types/empty_array.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::ops::Range; use crate::property::Domain; @@ -161,6 +162,11 @@ impl ValueType for EmptyArrayType { std::mem::size_of::() } + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(&rhs) + } + #[inline(always)] fn equal(_left: Self::ScalarRef<'_>, _right: Self::ScalarRef<'_>) -> bool { true diff --git a/src/query/expression/src/types/empty_map.rs b/src/query/expression/src/types/empty_map.rs index c7c07698ea4e..019cdc2d3705 100644 --- a/src/query/expression/src/types/empty_map.rs +++ b/src/query/expression/src/types/empty_map.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::ops::Range; use crate::property::Domain; @@ -160,6 +161,11 @@ impl ValueType for EmptyMapType { fn column_memory_size(_: &Self::Column) -> usize { std::mem::size_of::() } + + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(&rhs) + } } impl ArgType for EmptyMapType { diff --git a/src/query/expression/src/types/generic.rs b/src/query/expression/src/types/generic.rs index 7c32f0480011..91f5e9fd9de7 100755 --- a/src/query/expression/src/types/generic.rs +++ b/src/query/expression/src/types/generic.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::ops::Range; use crate::property::Domain; @@ -149,6 +150,11 @@ impl ValueType for GenericType { fn column_memory_size(col: &Self::Column) -> usize { col.memory_size() } + + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(&rhs) + } } impl ArgType for GenericType { diff --git a/src/query/expression/src/types/geography.rs b/src/query/expression/src/types/geography.rs index 492fdd734ac6..8c0d95e92c4e 100644 --- a/src/query/expression/src/types/geography.rs +++ b/src/query/expression/src/types/geography.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::cmp::Ordering; use std::fmt::Debug; use std::hash::Hash; use std::ops::Range; @@ -63,7 +64,7 @@ impl Geography { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct GeographyRef<'a>(pub &'a [u8]); impl<'a> GeographyRef<'a> { @@ -233,8 +234,9 @@ impl ValueType for GeographyType { col.memory_size() } - fn compare(a: Self::ScalarRef<'_>, b: Self::ScalarRef<'_>) -> Option { - a.partial_cmp(&b) + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(&rhs) } } diff --git a/src/query/expression/src/types/geometry.rs b/src/query/expression/src/types/geometry.rs index 7c2cb11b251a..67f1afc95ec9 100644 --- a/src/query/expression/src/types/geometry.rs +++ b/src/query/expression/src/types/geometry.rs @@ -167,6 +167,11 @@ impl ValueType for GeometryType { fn column_memory_size(col: &Self::Column) -> usize { col.data().len() + col.offsets().len() * 8 } + + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + lhs.cmp(rhs) + } } impl ArgType for GeometryType { diff --git a/src/query/expression/src/types/null.rs b/src/query/expression/src/types/null.rs index 60bf6906281d..91bb41203993 100644 --- a/src/query/expression/src/types/null.rs +++ b/src/query/expression/src/types/null.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::ops::Range; use super::nullable::NullableDomain; @@ -167,6 +168,11 @@ impl ValueType for NullType { fn column_memory_size(_: &Self::Column) -> usize { std::mem::size_of::() } + + #[inline(always)] + fn compare(_: Self::ScalarRef<'_>, _: Self::ScalarRef<'_>) -> Ordering { + Ordering::Equal + } } impl ArgType for NullType { diff --git a/src/query/expression/src/types/nullable.rs b/src/query/expression/src/types/nullable.rs index 63b6c1ddd805..a5c291136dde 100755 --- a/src/query/expression/src/types/nullable.rs +++ b/src/query/expression/src/types/nullable.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::marker::PhantomData; use std::ops::Range; @@ -222,6 +223,17 @@ impl ValueType for NullableType { fn column_memory_size(col: &Self::Column) -> usize { col.memory_size() } + + // Null default lastly + #[inline(always)] + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + match (lhs, rhs) { + (Some(lhs), Some(rhs)) => T::compare(lhs, rhs), + (Some(_), None) => Ordering::Greater, + (None, Some(_)) => Ordering::Less, + (None, None) => Ordering::Equal, + } + } } impl ArgType for NullableType { diff --git a/src/query/expression/src/types/number.rs b/src/query/expression/src/types/number.rs index 0afe13311c63..73f9e8922ad0 100644 --- a/src/query/expression/src/types/number.rs +++ b/src/query/expression/src/types/number.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
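The `NullableType` comparison above treats a present value as greater than NULL. A small sketch mirroring the match arms, not part of the patch; the `Int32Type` alias and its re-export path are assumptions.

use std::cmp::Ordering;

use databend_common_expression::types::Int32Type;
use databend_common_expression::types::NullableType;
use databend_common_expression::types::ValueType;

fn nullable_compare_examples() {
    // ScalarRef of NullableType<Int32Type> is Option<i32>.
    assert_eq!(NullableType::<Int32Type>::compare(Some(1), Some(2)), Ordering::Less);
    assert_eq!(NullableType::<Int32Type>::compare(Some(1), None), Ordering::Greater);
    assert_eq!(NullableType::<Int32Type>::compare(None, Some(1)), Ordering::Less);
    assert_eq!(NullableType::<Int32Type>::compare(None, None), Ordering::Equal);
}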
+use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; use std::ops::Range; @@ -218,6 +219,11 @@ impl ValueType for NumberType { builder[0] } + #[inline(always)] + fn compare(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> Ordering { + left.cmp(&right) + } + #[inline(always)] fn equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { left == right diff --git a/src/query/expression/src/types/string.rs b/src/query/expression/src/types/string.rs index 11efef686b14..e7a62e9f1e3b 100644 --- a/src/query/expression/src/types/string.rs +++ b/src/query/expression/src/types/string.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::iter::once; use std::ops::Range; @@ -168,6 +169,11 @@ impl ValueType for StringType { col.data().len() + col.offsets().len() * 8 } + #[inline(always)] + fn compare(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> Ordering { + left.cmp(right) + } + #[inline(always)] fn equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { left == right diff --git a/src/query/expression/src/types/timestamp.rs b/src/query/expression/src/types/timestamp.rs index b2970b5eb862..dc903c346721 100644 --- a/src/query/expression/src/types/timestamp.rs +++ b/src/query/expression/src/types/timestamp.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp::Ordering; use std::fmt::Display; use std::io::Cursor; use std::ops::Range; @@ -192,6 +193,11 @@ impl ValueType for TimestampType { builder[0] } + #[inline(always)] + fn compare(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> Ordering { + left.cmp(&right) + } + #[inline(always)] fn equal(left: Self::ScalarRef<'_>, right: Self::ScalarRef<'_>) -> bool { left == right diff --git a/src/query/expression/src/types/variant.rs b/src/query/expression/src/types/variant.rs index 557fab4004ee..1a3c8fac1845 100644 --- a/src/query/expression/src/types/variant.rs +++ b/src/query/expression/src/types/variant.rs @@ -179,8 +179,8 @@ impl ValueType for VariantType { } #[inline(always)] - fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Option { - Some(jsonb::compare(lhs, rhs).expect("unable to parse jsonb value")) + fn compare(lhs: Self::ScalarRef<'_>, rhs: Self::ScalarRef<'_>) -> Ordering { + jsonb::compare(lhs, rhs).expect("unable to parse jsonb value") } } diff --git a/src/query/expression/src/utils/mod.rs b/src/query/expression/src/utils/mod.rs index 3d65daf97acd..bb0ba0cd7967 100644 --- a/src/query/expression/src/utils/mod.rs +++ b/src/query/expression/src/utils/mod.rs @@ -23,6 +23,7 @@ pub mod filter_helper; pub mod serialize; pub mod udf_client; pub mod variant_transform; +pub mod visitor; use databend_common_arrow::arrow::bitmap::Bitmap; use databend_common_ast::Span; diff --git a/src/query/expression/src/utils/visitor.rs b/src/query/expression/src/utils/visitor.rs new file mode 100755 index 000000000000..c3f7929b4f08 --- /dev/null +++ b/src/query/expression/src/utils/visitor.rs @@ -0,0 +1,141 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_arrow::arrow::bitmap::Bitmap; +use databend_common_arrow::arrow::buffer::Buffer; +use databend_common_exception::Result; +use decimal::DecimalType; +use geometry::GeometryType; + +use crate::types::*; +use crate::*; + +pub trait ValueVisitor { + fn visit_scalar(&mut self, _scalar: Scalar) -> Result<()>; + + fn visit_null(&mut self, len: usize) -> Result<()> { + self.visit_typed_column::(len) + } + + fn visit_empty_array(&mut self, len: usize) -> Result<()> { + self.visit_typed_column::(len) + } + + fn visit_empty_map(&mut self, len: usize) -> Result<()> { + self.visit_typed_column::(len) + } + + fn visit_number( + &mut self, + column: as ValueType>::Column, + ) -> Result<()> { + self.visit_typed_column::>(column) + } + + fn visit_decimal(&mut self, column: Buffer) -> Result<()> { + self.visit_typed_column::>(column) + } + + fn visit_boolean(&mut self, bitmap: Bitmap) -> Result<()> { + self.visit_typed_column::(bitmap) + } + + fn visit_binary(&mut self, column: BinaryColumn) -> Result<()> { + self.visit_typed_column::(column) + } + + fn visit_string(&mut self, column: StringColumn) -> Result<()> { + self.visit_typed_column::(column) + } + + fn visit_timestamp(&mut self, buffer: Buffer) -> Result<()> { + self.visit_typed_column::(buffer) + } + + fn visit_date(&mut self, buffer: Buffer) -> Result<()> { + self.visit_typed_column::(buffer) + } + + fn visit_array(&mut self, column: Box>) -> Result<()> { + self.visit_typed_column::(Column::Array(column)) + } + + fn visit_map(&mut self, column: Box>) -> Result<()> { + self.visit_typed_column::(Column::Map(column)) + } + + fn visit_tuple(&mut self, columns: Vec) -> Result<()> { + self.visit_typed_column::(Column::Tuple(columns)) + } + + fn visit_bitmap(&mut self, column: BinaryColumn) -> Result<()> { + self.visit_typed_column::(column) + } + + fn visit_nullable(&mut self, column: Box>) -> Result<()> { + self.visit_typed_column::(Column::Nullable(column)) + } + + fn visit_variant(&mut self, column: BinaryColumn) -> Result<()> { + self.visit_typed_column::(column) + } + + fn visit_geometry(&mut self, column: BinaryColumn) -> Result<()> { + self.visit_typed_column::(column) + } + + fn visit_geography(&mut self, column: GeographyColumn) -> Result<()> { + self.visit_typed_column::(column) + } + + fn visit_typed_column(&mut self, column: ::Column) -> Result<()>; + + fn visit_value(&mut self, value: Value) -> Result<()> { + match value { + Value::Scalar(c) => self.visit_scalar(c), + Value::Column(c) => self.visit_column(c), + } + } + + fn visit_column(&mut self, column: Column) -> Result<()> { + match column { + Column::Null { len } => self.visit_null(len), + Column::EmptyArray { len } => self.visit_empty_array(len), + Column::EmptyMap { len } => self.visit_empty_map(len), + Column::Number(column) => { + with_number_type!(|NUM_TYPE| match column { + NumberColumn::NUM_TYPE(b) => self.visit_number(b), + }) + } + Column::Decimal(column) => { + with_decimal_type!(|DECIMAL_TYPE| match column { + DecimalColumn::DECIMAL_TYPE(b, _) => self.visit_decimal(b), + }) + } + Column::Boolean(bitmap) => self.visit_boolean(bitmap), + 
Column::Binary(column) => self.visit_binary(column), + Column::String(column) => self.visit_string(column), + Column::Timestamp(buffer) => self.visit_timestamp(buffer), + Column::Date(buffer) => self.visit_date(buffer), + Column::Array(column) => self.visit_array(column), + Column::Map(column) => self.visit_map(column), + Column::Tuple(columns) => self.visit_tuple(columns), + Column::Bitmap(column) => self.visit_bitmap(column), + Column::Nullable(column) => self.visit_nullable(column), + Column::Variant(column) => self.visit_variant(column), + Column::Geometry(column) => self.visit_geometry(column), + Column::Geography(column) => self.visit_geography(column), + } + } +} diff --git a/src/query/expression/src/values.rs b/src/query/expression/src/values.rs index 3c0813785305..685d948684a8 100755 --- a/src/query/expression/src/values.rs +++ b/src/query/expression/src/values.rs @@ -884,11 +884,29 @@ impl PartialOrd for Column { (Column::Geography(col1), Column::Geography(col2)) => { col1.iter().partial_cmp(col2.iter()) } - _ => None, + (a, b) => { + if a.len() != b.len() { + a.len().partial_cmp(&b.len()) + } else { + for (l, r) in AnyType::iter_column(a).zip(AnyType::iter_column(b)) { + match l.partial_cmp(&r) { + Some(Ordering::Equal) => {} + other => return other, + } + } + Some(Ordering::Equal) + } + } } } } +impl Ord for Column { + fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other).unwrap_or(Ordering::Equal) + } +} + impl PartialEq for Column { fn eq(&self, other: &Self) -> bool { self.partial_cmp(other) == Some(Ordering::Equal) diff --git a/src/query/expression/tests/it/sort.rs b/src/query/expression/tests/it/sort.rs index fcc148c38b8c..f9fc04d2dc78 100644 --- a/src/query/expression/tests/it/sort.rs +++ b/src/query/expression/tests/it/sort.rs @@ -12,15 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
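A minimal implementor of the new `ValueVisitor` trait only needs `visit_scalar` and `visit_typed_column`; the defaults funnel every other `visit_*` into those two. The sketch below counts top-level NULLs and is not part of the patch; the `databend_common_expression::visitor` path is assumed from the `crate::visitor` import above.

use databend_common_exception::Result;
use databend_common_expression::types::AnyType;
use databend_common_expression::types::NullableColumn;
use databend_common_expression::types::ValueType;
use databend_common_expression::visitor::ValueVisitor;
use databend_common_expression::Scalar;

#[derive(Default)]
struct NullCounter {
    nulls: usize,
}

impl ValueVisitor for NullCounter {
    fn visit_scalar(&mut self, scalar: Scalar) -> Result<()> {
        if matches!(scalar, Scalar::Null) {
            self.nulls += 1;
        }
        Ok(())
    }

    // Non-nullable columns contribute no NULLs.
    fn visit_typed_column<T: ValueType>(&mut self, _column: T::Column) -> Result<()> {
        Ok(())
    }

    fn visit_null(&mut self, len: usize) -> Result<()> {
        self.nulls += len;
        Ok(())
    }

    fn visit_nullable(&mut self, column: Box<NullableColumn<AnyType>>) -> Result<()> {
        self.nulls += column.validity.unset_bits();
        Ok(())
    }
}

Driving it with `visitor.visit_value(Value::Column(col))` routes `Column::Nullable` into the override and every other variant into the no-op.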
-use std::sync::Arc; use std::vec; use databend_common_exception::Result; +use databend_common_expression::block_debug::assert_block_value_eq; use databend_common_expression::types::decimal::*; use databend_common_expression::types::number::*; use databend_common_expression::types::StringType; -use databend_common_expression::AbortChecker; -use databend_common_expression::CheckAbort; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::FromData; @@ -200,131 +198,11 @@ fn test_block_sort() -> Result<()> { entry.value ); } - } - - Ok(()) -} - -#[test] -fn test_blocks_merge_sort() -> Result<()> { - let blocks = vec![ - new_block(&[ - Int64Type::from_data(vec![4i64, 6]), - StringType::from_data(vec!["b2", "b1"]), - ]), - new_block(&[ - Int64Type::from_data(vec![2i64, 3]), - StringType::from_data(vec!["b4", "b3"]), - ]), - new_block(&[ - Int64Type::from_data(vec![1i64, 1]), - StringType::from_data(vec!["b6", "b5"]), - ]), - ]; - - // test cast: - // - name - // - sort descriptions - // - limit - // - expected cols - #[allow(clippy::type_complexity)] - let test_cases: Vec<( - String, - Vec, - Option, - Vec, - )> = vec![ - ( - "order by col1".to_string(), - vec![SortColumnDescription { - offset: 0, - asc: true, - nulls_first: false, - is_nullable: false, - }], - None, - vec![ - Int64Type::from_data(vec![1_i64, 1, 2, 3, 4, 6]), - StringType::from_data(vec!["b6", "b5", "b4", "b3", "b2", "b1"]), - ], - ), - ( - "order by col1 limit 4".to_string(), - vec![SortColumnDescription { - offset: 0, - asc: true, - nulls_first: false, - is_nullable: false, - }], - Some(4), - vec![ - Int64Type::from_data(vec![1_i64, 1, 2, 3]), - StringType::from_data(vec!["b6", "b5", "b4", "b3"]), - ], - ), - ( - "order by col2 desc".to_string(), - vec![SortColumnDescription { - offset: 1, - asc: false, - nulls_first: false, - is_nullable: false, - }], - None, - vec![ - Int64Type::from_data(vec![1_i64, 1, 2, 3, 4, 6]), - StringType::from_data(vec!["b6", "b5", "b4", "b3", "b2", "b1"]), - ], - ), - ( - "order by col1, col2 desc".to_string(), - vec![ - SortColumnDescription { - offset: 0, - asc: true, - nulls_first: false, - is_nullable: false, - }, - SortColumnDescription { - offset: 1, - asc: false, - nulls_first: false, - is_nullable: false, - }, - ], - None, - vec![ - Int64Type::from_data(vec![1_i64, 1, 2, 3, 4, 6]), - StringType::from_data(vec!["b6", "b5", "b4", "b3", "b2", "b1"]), - ], - ), - ]; - - struct NeverAbort; - impl CheckAbort for NeverAbort { - fn is_aborting(&self) -> bool { - false - } - fn try_check_aborting(&self) -> Result<()> { - Ok(()) - } - } - let aborting: AbortChecker = Arc::new(NeverAbort); - - for (name, sort_descs, limit, expected) in test_cases { - let res = DataBlock::merge_sort(&blocks, &sort_descs, limit, aborting.clone())?; - - for (entry, expect) in res.columns().iter().zip(expected.iter()) { - assert_eq!( - entry.value.as_column().unwrap(), - expect, - "{}: the column after sort is wrong, expect: {:?}, got: {:?}", - name, - expect, - entry.value - ); - } + // test new sort algorithm + let res = DataBlock::sort_old(&decimal_block, &sort_descs, Some(decimal_block.num_rows()))?; + let res_new = DataBlock::sort(&decimal_block, &sort_descs, None)?; + assert_block_value_eq(&res, &res_new); } Ok(()) diff --git a/src/query/pipeline/core/src/lib.rs b/src/query/pipeline/core/src/lib.rs index be296444ca25..8e0a3ab7dd59 100644 --- a/src/query/pipeline/core/src/lib.rs +++ b/src/query/pipeline/core/src/lib.rs @@ -14,6 +14,7 @@ 
#![feature(once_cell_try)] #![feature(variant_count)] +#![feature(associated_type_defaults)] #![allow(clippy::arc_with_non_send_sync)] #![allow(clippy::useless_asref)] diff --git a/src/query/pipeline/core/src/pipeline.rs b/src/query/pipeline/core/src/pipeline.rs index 7fe725ddd9de..9d74f386490e 100644 --- a/src/query/pipeline/core/src/pipeline.rs +++ b/src/query/pipeline/core/src/pipeline.rs @@ -33,8 +33,11 @@ use crate::finished_chain::FinishedCallbackChain; use crate::pipe::Pipe; use crate::pipe::PipeItem; use crate::processors::DuplicateProcessor; +use crate::processors::Exchange; use crate::processors::InputPort; +use crate::processors::MergePartitionProcessor; use crate::processors::OutputPort; +use crate::processors::PartitionProcessor; use crate::processors::PlanScope; use crate::processors::PlanScopeGuard; use crate::processors::ProcessorPtr; @@ -444,6 +447,55 @@ impl Pipeline { } } + pub fn exchange(&mut self, n: usize, exchange: Arc) { + if let Some(pipe) = self.pipes.last() { + if pipe.output_length < 1 { + return; + } + + let input_len = pipe.output_length; + let mut items = Vec::with_capacity(input_len); + + for _index in 0..input_len { + let input = InputPort::create(); + let outputs: Vec<_> = (0..n).map(|_| OutputPort::create()).collect(); + items.push(PipeItem::create( + PartitionProcessor::create(input.clone(), outputs.clone(), exchange.clone()), + vec![input], + outputs, + )); + } + + // partition data block + self.add_pipe(Pipe::create(input_len, input_len * n, items)); + + let mut reorder_edges = Vec::with_capacity(input_len * n); + for index in 0..input_len * n { + reorder_edges.push((index % n) * input_len + (index / n)); + } + + self.reorder_inputs(reorder_edges); + + let mut items = Vec::with_capacity(input_len); + for _index in 0..n { + let output = OutputPort::create(); + let inputs: Vec<_> = (0..input_len).map(|_| InputPort::create()).collect(); + items.push(PipeItem::create( + MergePartitionProcessor::create( + inputs.clone(), + output.clone(), + exchange.clone(), + ), + inputs, + vec![output], + )); + } + + // merge partition + self.add_pipe(Pipe::create(input_len * n, n, items)) + } + } + #[track_caller] pub fn set_on_init Result<()> + Send + Sync + 'static>(&mut self, f: F) { let location = std::panic::Location::caller(); diff --git a/src/query/pipeline/core/src/processors/mod.rs b/src/query/pipeline/core/src/processors/mod.rs index 61e6f7005a60..c3b0e1772a34 100644 --- a/src/query/pipeline/core/src/processors/mod.rs +++ b/src/query/pipeline/core/src/processors/mod.rs @@ -37,4 +37,7 @@ pub use profile::PlanScope; pub use profile::PlanScopeGuard; pub use resize_processor::create_resize_item; pub use resize_processor::ResizeProcessor; +pub use shuffle_processor::Exchange; +pub use shuffle_processor::MergePartitionProcessor; +pub use shuffle_processor::PartitionProcessor; pub use shuffle_processor::ShuffleProcessor; diff --git a/src/query/pipeline/core/src/processors/processor.rs b/src/query/pipeline/core/src/processors/processor.rs index 3112c02b513b..ce70053b80de 100644 --- a/src/query/pipeline/core/src/processors/processor.rs +++ b/src/query/pipeline/core/src/processors/processor.rs @@ -34,7 +34,7 @@ pub enum Event { Finished, } -#[derive(Clone)] +#[derive(Clone, Debug)] pub enum EventCause { Other, // Which input of the processor triggers the event diff --git a/src/query/pipeline/core/src/processors/shuffle_processor.rs b/src/query/pipeline/core/src/processors/shuffle_processor.rs index ebfeefb6479e..d998c22a2481 100644 --- 
a/src/query/pipeline/core/src/processors/shuffle_processor.rs
+++ b/src/query/pipeline/core/src/processors/shuffle_processor.rs
@@ -16,12 +16,27 @@ use std::any::Any;
 use std::sync::Arc;
 use databend_common_exception::Result;
+use databend_common_expression::DataBlock;
 use crate::processors::Event;
 use crate::processors::EventCause;
 use crate::processors::InputPort;
 use crate::processors::OutputPort;
 use crate::processors::Processor;
+use crate::processors::ProcessorPtr;
+
+pub enum MultiwayStrategy {
+    Random,
+    Custom,
+}
+
+pub trait Exchange: Send + Sync + 'static {
+    const STRATEGY: MultiwayStrategy = MultiwayStrategy::Random;
+
+    fn partition(&self, state: DataBlock, n: usize) -> Result<Vec<DataBlock>>;
+
+    fn multiway_pick(&self, partitions: &[Option<DataBlock>]) -> Result<usize>;
+}

 pub struct ShuffleProcessor {
     input2output: Vec<usize>,
@@ -139,3 +154,203 @@ impl Processor for ShuffleProcessor {
         Ok(Event::NeedData)
     }
 }
+
+pub struct PartitionProcessor<T: Exchange> {
+    input: Arc<InputPort>,
+    outputs: Vec<Arc<OutputPort>>,
+
+    exchange: Arc<T>,
+    input_data: Option<DataBlock>,
+    partitioned_data: Vec<Option<DataBlock>>,
+}
+
+impl<T: Exchange> PartitionProcessor<T> {
+    pub fn create(
+        input: Arc<InputPort>,
+        outputs: Vec<Arc<OutputPort>>,
+        exchange: Arc<T>,
+    ) -> ProcessorPtr {
+        let partitioned_data = vec![None; outputs.len()];
+        ProcessorPtr::create(Box::new(PartitionProcessor {
+            input,
+            outputs,
+            exchange,
+            partitioned_data,
+            input_data: None,
+        }))
+    }
+}
+
+impl<T: Exchange> Processor for PartitionProcessor<T> {
+    fn name(&self) -> String {
+        String::from("ShufflePartition")
+    }
+
+    fn as_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn event(&mut self) -> Result<Event> {
+        let mut all_output_finished = true;
+        let mut all_data_pushed_output = true;
+
+        for (index, output) in self.outputs.iter().enumerate() {
+            if output.is_finished() {
+                self.partitioned_data[index].take();
+                continue;
+            }
+
+            all_output_finished = false;
+
+            if output.can_push() {
+                if let Some(block) = self.partitioned_data[index].take() {
+                    output.push_data(Ok(block));
+
+                    continue;
+                }
+            }
+
+            if self.partitioned_data[index].is_some() {
+                all_data_pushed_output = false;
+            }
+        }
+
+        if all_output_finished {
+            self.input.finish();
+            return Ok(Event::Finished);
+        }
+
+        if !all_data_pushed_output {
+            self.input.set_not_need_data();
+            return Ok(Event::NeedConsume);
+        }
+
+        if self.input.has_data() {
+            self.input_data = Some(self.input.pull_data().unwrap()?);
+            return Ok(Event::Sync);
+        }
+
+        if self.input.is_finished() {
+            for output in &self.outputs {
+                output.finish();
+            }
+
+            return Ok(Event::Finished);
+        }
+
+        self.input.set_need_data();
+        Ok(Event::NeedData)
+    }
+
+    fn process(&mut self) -> Result<()> {
+        if let Some(block) = self.input_data.take() {
+            let partitioned = self.exchange.partition(block, self.outputs.len())?;
+
+            for (index, block) in partitioned.into_iter().enumerate() {
+                if block.is_empty() && block.get_meta().is_none() {
+                    continue;
+                }
+
+                self.partitioned_data[index] = Some(block);
+            }
+        }
+
+        Ok(())
+    }
+}
+
+pub struct MergePartitionProcessor<T: Exchange> {
+    exchange: Arc<T>,
+
+    output: Arc<OutputPort>,
+    inputs: Vec<Arc<InputPort>>,
+    inputs_data: Vec<Option<DataBlock>>,
+}
+
+impl<T: Exchange> MergePartitionProcessor<T> {
+    pub fn create(
+        inputs: Vec<Arc<InputPort>>,
+        output: Arc<OutputPort>,
+        exchange: Arc<T>,
+    ) -> ProcessorPtr {
+        let inputs_data = vec![None; inputs.len()];
+        ProcessorPtr::create(Box::new(MergePartitionProcessor {
+            output,
+            inputs,
+            exchange,
+            inputs_data,
+        }))
+    }
+}
+
+impl<T: Exchange> Processor for MergePartitionProcessor<T> {
+    fn name(&self) -> String {
+        String::from("ShuffleMergePartition")
+    }
+
+    fn as_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn event(&mut self) -> Result<Event> {
+        if self.output.is_finished() {
+            for input in &self.inputs {
+                input.finish();
+            }
+
+            return Ok(Event::Finished);
+        }
+
+        if !self.output.can_push() {
+            return Ok(Event::NeedConsume);
+        }
+
+        let mut all_inputs_finished = true;
+        let mut need_pick_block_to_push = matches!(T::STRATEGY, MultiwayStrategy::Custom);
+
+        for (index, input) in self.inputs.iter().enumerate() {
+            if input.is_finished() {
+                continue;
+            }
+
+            all_inputs_finished = false;
+
+            if input.has_data() {
+                match T::STRATEGY {
+                    MultiwayStrategy::Random => {
+                        if self.output.can_push() {
+                            self.output.push_data(Ok(input.pull_data().unwrap()?));
+                        }
+                    }
+                    MultiwayStrategy::Custom => {
+                        if self.inputs_data[index].is_none() {
+                            self.inputs_data[index] = Some(input.pull_data().unwrap()?);
+                        }
+                    }
+                }
+            }
+
+            if self.inputs_data[index].is_none() {
+                need_pick_block_to_push = false;
+            }
+
+            input.set_need_data();
+        }
+
+        if all_inputs_finished {
+            self.output.finish();
+            return Ok(Event::Finished);
+        }
+
+        if need_pick_block_to_push {
+            let pick_index = self.exchange.multiway_pick(&self.inputs_data)?;
+
+            if let Some(block) = self.inputs_data[pick_index].take() {
+                self.output.push_data(Ok(block));
+                return Ok(Event::NeedConsume);
+            }
+        }
+
+        Ok(Event::NeedData)
+    }
+}
diff --git a/src/query/pipeline/transforms/src/processors/transforms/transform_sort_partial.rs b/src/query/pipeline/transforms/src/processors/transforms/transform_sort_partial.rs
index 2a4b7f9261e2..664880d42fad 100644
--- a/src/query/pipeline/transforms/src/processors/transforms/transform_sort_partial.rs
+++ b/src/query/pipeline/transforms/src/processors/transforms/transform_sort_partial.rs
@@ -16,6 +16,7 @@ use std::sync::Arc;
 use databend_common_exception::Result;
 use databend_common_expression::DataBlock;
+use databend_common_expression::LimitType;
 use databend_common_expression::SortColumnDescription;
 use databend_common_pipeline_core::processors::InputPort;
 use databend_common_pipeline_core::processors::OutputPort;
@@ -25,13 +26,13 @@ use crate::processors::transforms::Transform;
 use crate::processors::transforms::Transformer;

 pub struct TransformSortPartial {
-    limit: Option<usize>,
+    limit: LimitType,
     sort_columns_descriptions: Arc<Vec<SortColumnDescription>>,
 }

 impl TransformSortPartial {
     pub fn new(
-        limit: Option<usize>,
+        limit: LimitType,
         sort_columns_descriptions: Arc<Vec<SortColumnDescription>>,
     ) -> Self {
         Self {
@@ -43,7 +44,7 @@ impl TransformSortPartial {
     pub fn try_create(
         input: Arc<InputPort>,
         output: Arc<OutputPort>,
-        limit: Option<usize>,
+        limit: LimitType,
         sort_columns_descriptions: Arc<Vec<SortColumnDescription>>,
     ) -> Result<Box<dyn Processor>> {
         Ok(Transformer::create(input, output, TransformSortPartial {
@@ -58,6 +59,6 @@ impl Transform for TransformSortPartial {
     const NAME: &'static str = "SortPartialTransform";

     fn transform(&mut self, block: DataBlock) -> Result<DataBlock> {
-        DataBlock::sort(&block, &self.sort_columns_descriptions, self.limit)
+        DataBlock::sort_with_type(&block, &self.sort_columns_descriptions, self.limit)
     }
 }
diff --git a/src/query/script/src/executor.rs b/src/query/script/src/executor.rs
index 2041fec37a46..a9ee96cf1fe1 100644
--- a/src/query/script/src/executor.rs
+++ b/src/query/script/src/executor.rs
@@ -35,7 +35,7 @@ pub trait Client {
     fn var_to_ast(&self, scalar: &Self::Var) -> Result<Expr>;
     fn read_from_set(&self, block: &Self::Set, row: usize, col: &ColumnAccess) -> Result<Self::Var>;
-    fn set_len(&self, block: &Self::Set) -> usize;
+    fn num_rows(&self, block: &Self::Set) -> usize;
     fn is_true(&self, scalar: &Self::Var) -> Result<bool>;
 }
@@ -132,7 +132,7 @@ impl Executor {
         let cursor = Cursor {
             set: set.clone(),
             row: 0,
-            len: self.client.set_len(block),
+            len: self.client.num_rows(block),
         };
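The `Pipeline::exchange` hunk above fans `input_len` PartitionProcessors (each with `n` outputs) into `n` MergePartitionProcessors, reordering edges with `(index % n) * input_len + index / n` so that partition `j` of every upstream pipe lands in front of the `j`-th merge processor. A standalone check of that permutation, assuming the rule maps an edge's current position to its new one (names are illustrative, not part of the patch):

```rust
fn reorder_edges(input_len: usize, n: usize) -> Vec<usize> {
    (0..input_len * n)
        .map(|index| (index % n) * input_len + index / n)
        .collect()
}

fn main() {
    // 2 upstream pipes, 3 partitions: edge i*n + j (pipe i, partition j)
    // moves to position j*input_len + i, i.e. partitions are grouped.
    assert_eq!(reorder_edges(2, 3), vec![0, 2, 4, 1, 3, 5]);
}
```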
self.iters.insert(to_iter.clone(), cursor); } diff --git a/src/query/script/tests/it/main.rs b/src/query/script/tests/it/main.rs index af781da277f2..1489dde9bf16 100644 --- a/src/query/script/tests/it/main.rs +++ b/src/query/script/tests/it/main.rs @@ -689,7 +689,7 @@ impl Client for MockClient { Ok(var) } - fn set_len(&self, set: &Self::Set) -> usize { + fn num_rows(&self, set: &Self::Set) -> usize { set.data.len() } diff --git a/src/query/service/src/catalogs/default/mutable_catalog.rs b/src/query/service/src/catalogs/default/mutable_catalog.rs index d55ce4e96a92..a0568ab38df3 100644 --- a/src/query/service/src/catalogs/default/mutable_catalog.rs +++ b/src/query/service/src/catalogs/default/mutable_catalog.rs @@ -687,7 +687,17 @@ impl Catalog for MutableCatalog { } async fn get_sequence(&self, req: GetSequenceReq) -> Result { - Ok(self.ctx.meta.get_sequence(req).await?) + let seq_meta = self.ctx.meta.get_sequence(&req.ident).await?; + + let Some(seq_meta) = seq_meta else { + return Err(KVAppError::AppError(AppError::SequenceError( + req.ident.unknown_error(func_name!()).into(), + )) + .into()); + }; + Ok(GetSequenceReply { + meta: seq_meta.data, + }) } async fn get_sequence_next_value( diff --git a/src/query/service/src/interpreters/access/privilege_access.rs b/src/query/service/src/interpreters/access/privilege_access.rs index 37013d5a0e70..e0577cea1669 100644 --- a/src/query/service/src/interpreters/access/privilege_access.rs +++ b/src/query/service/src/interpreters/access/privilege_access.rs @@ -1260,6 +1260,7 @@ impl AccessChecker for PrivilegeAccess { Plan::DescDatamaskPolicy(_) => {} Plan::Begin => {} Plan::ExecuteImmediate(_) + | Plan::CallProcedure(_) | Plan::CreateProcedure(_) | Plan::DropProcedure(_) /*| Plan::ShowCreateProcedure(_) diff --git a/src/query/service/src/interpreters/interpreter_execute_immediate.rs b/src/query/service/src/interpreters/interpreter_execute_immediate.rs index 3b3ce89f83b8..94799c782d09 100644 --- a/src/query/service/src/interpreters/interpreter_execute_immediate.rs +++ b/src/query/service/src/interpreters/interpreter_execute_immediate.rs @@ -15,42 +15,24 @@ use std::sync::Arc; use databend_common_ast::ast::DeclareItem; -use databend_common_ast::ast::Expr; -use databend_common_ast::ast::FunctionCall; -use databend_common_ast::ast::Identifier; -use databend_common_ast::ast::Literal; use databend_common_ast::ast::ScriptStatement; -use databend_common_ast::ast::TypeName; use databend_common_ast::parser::run_parser; use databend_common_ast::parser::script::script_block; use databend_common_ast::parser::tokenize_sql; use databend_common_ast::parser::ParseMode; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::block_debug::box_render; -use databend_common_expression::types::decimal::DecimalScalar; -use databend_common_expression::types::decimal::MAX_DECIMAL256_PRECISION; -use databend_common_expression::types::NumberScalar; use databend_common_expression::types::StringType; -use databend_common_expression::with_integer_mapped_type; use databend_common_expression::DataBlock; -use databend_common_expression::DataSchemaRef; use databend_common_expression::FromData; -use databend_common_expression::Scalar; use databend_common_script::compile; -use databend_common_script::ir::ColumnAccess; -use databend_common_script::Client; use databend_common_script::Executor; use databend_common_script::ReturnValue; use databend_common_sql::plans::ExecuteImmediatePlan; -use databend_common_sql::Planner; use 
databend_common_storages_fuse::TableContext; -use futures::TryStreamExt; -use itertools::Itertools; -use serde_json::Value as JsonValue; +use crate::interpreters::util::ScriptClient; use crate::interpreters::Interpreter; -use crate::interpreters::InterpreterFactory; use crate::pipelines::PipelineBuildResult; use crate::sessions::QueryContext; @@ -139,295 +121,3 @@ impl Interpreter for ExecuteImmediateInterpreter { res.map_err(|err| err.display_with_sql(&self.plan.script)) } } - -#[derive(Debug, Clone)] -struct QueryResult { - schema: DataSchemaRef, - block: DataBlock, -} - -struct ScriptClient { - ctx: Arc, -} - -impl Client for ScriptClient { - type Var = Scalar; - type Set = QueryResult; - - async fn query(&self, query: &str) -> Result { - let ctx = self - .ctx - .get_current_session() - .create_query_context() - .await?; - - let mut planner = Planner::new(ctx.clone()); - let (plan, _) = planner.plan_sql(query).await?; - let interpreter = InterpreterFactory::get(ctx.clone(), &plan).await?; - let stream = interpreter.execute(ctx.clone()).await?; - let blocks = stream.try_collect::>().await?; - let schema = plan.schema(); - - let block = match blocks.len() { - 0 => DataBlock::empty_with_schema(schema.clone()), - 1 => blocks[0].clone(), - _ => DataBlock::concat(&blocks)?, - }; - - Ok(QueryResult { schema, block }) - } - - fn var_to_ast(&self, scalar: &Self::Var) -> Result { - let ast = match scalar { - Scalar::Number(v) => with_integer_mapped_type!(|NUM_TYPE| match v { - NumberScalar::NUM_TYPE(v) => Expr::Literal { - span: None, - value: Literal::Decimal256 { - value: (*v).into(), - precision: MAX_DECIMAL256_PRECISION, - scale: 0, - }, - }, - NumberScalar::Float32(v) => Expr::Literal { - span: None, - value: Literal::Float64(v.into_inner() as f64), - }, - NumberScalar::Float64(v) => Expr::Literal { - span: None, - value: Literal::Float64(v.into_inner()), - }, - }), - Scalar::Boolean(v) => Expr::Literal { - span: None, - value: Literal::Boolean(*v), - }, - Scalar::String(v) => Expr::Literal { - span: None, - value: Literal::String(v.clone()), - }, - Scalar::Tuple(v) => Expr::FunctionCall { - span: None, - func: FunctionCall { - distinct: false, - name: Identifier::from_name(None, "tuple"), - args: v - .iter() - .map(|x| self.var_to_ast(&x.to_owned())) - .collect::>>()?, - params: vec![], - window: None, - lambda: None, - }, - }, - Scalar::Array(v) => Expr::FunctionCall { - span: None, - func: FunctionCall { - distinct: false, - name: Identifier::from_name(None, "array"), - args: v - .iter() - .map(|x| self.var_to_ast(&x.to_owned())) - .collect::>>()?, - params: vec![], - window: None, - lambda: None, - }, - }, - Scalar::Decimal(DecimalScalar::Decimal128(v, size)) => Expr::Literal { - span: None, - value: Literal::Decimal256 { - value: (*v).into(), - precision: size.precision, - scale: size.scale, - }, - }, - Scalar::Decimal(DecimalScalar::Decimal256(v, size)) => Expr::Literal { - span: None, - value: Literal::Decimal256 { - value: *v, - precision: size.precision, - scale: size.scale, - }, - }, - Scalar::Map(v) => { - let col = v.as_tuple().unwrap(); - let keys = col[0] - .iter() - .map(|x| self.var_to_ast(&x.to_owned())) - .collect::>>()?; - let vals = col[1] - .iter() - .map(|x| self.var_to_ast(&x.to_owned())) - .collect::>>()?; - Expr::FunctionCall { - span: None, - func: FunctionCall { - distinct: false, - name: Identifier::from_name(None, "map"), - args: vec![ - Expr::FunctionCall { - span: None, - func: FunctionCall { - distinct: false, - name: Identifier::from_name(None, "array"), - 
args: keys, - params: vec![], - window: None, - lambda: None, - }, - }, - Expr::FunctionCall { - span: None, - func: FunctionCall { - distinct: false, - name: Identifier::from_name(None, "array"), - args: vals, - params: vec![], - window: None, - lambda: None, - }, - }, - ], - params: vec![], - window: None, - lambda: None, - }, - } - } - Scalar::Variant(v) => { - let value = jsonb::from_slice(v).unwrap(); - let json = JsonValue::from(value).to_string(); - Expr::FunctionCall { - span: None, - func: FunctionCall { - distinct: false, - name: Identifier::from_name(None, "parse_json"), - args: vec![Expr::Literal { - span: None, - value: Literal::String(json), - }], - params: vec![], - window: None, - lambda: None, - }, - } - } - Scalar::EmptyArray => Expr::FunctionCall { - span: None, - func: FunctionCall { - distinct: false, - name: Identifier::from_name(None, "array"), - args: vec![], - params: vec![], - window: None, - lambda: None, - }, - }, - Scalar::EmptyMap => Expr::FunctionCall { - span: None, - func: FunctionCall { - distinct: false, - name: Identifier::from_name(None, "map"), - args: vec![], - params: vec![], - window: None, - lambda: None, - }, - }, - Scalar::Date(v) => Expr::Cast { - span: None, - expr: Box::new(Expr::Literal { - span: None, - value: Literal::Decimal256 { - value: (*v).into(), - precision: MAX_DECIMAL256_PRECISION, - scale: 0, - }, - }), - target_type: TypeName::Date, - pg_style: false, - }, - Scalar::Timestamp(v) => Expr::Cast { - span: None, - expr: Box::new(Expr::Literal { - span: None, - value: Literal::Decimal256 { - value: (*v).into(), - precision: MAX_DECIMAL256_PRECISION, - scale: 0, - }, - }), - target_type: TypeName::Timestamp, - pg_style: false, - }, - Scalar::Null => Expr::Literal { - span: None, - value: Literal::Null, - }, - Scalar::Bitmap(_) | Scalar::Binary(_) | Scalar::Geometry(_) | Scalar::Geography(_) => { - return Err(ErrorCode::Unimplemented(format!( - "variable of type {} is not supported yet", - scalar.as_ref().infer_data_type() - ))); - } - }; - - Ok(ast) - } - - fn read_from_set(&self, set: &Self::Set, row: usize, col: &ColumnAccess) -> Result { - let offset = match col { - ColumnAccess::Position(offset) => *offset, - // TODO(andylokandy): name resolution - ColumnAccess::Name(name) => set - .schema - .fields() - .iter() - .position(|f| f.name() == name) - .ok_or_else(|| { - ErrorCode::ScriptExecutionError(format!( - "cannot find column with name {} in block, available columns: {}", - name, - set.schema - .fields() - .iter() - .map(|f| format!("'{}'", f.name())) - .join(", ") - )) - })?, - }; - let col = set.block.columns().get(offset).ok_or_else(|| { - ErrorCode::ScriptExecutionError(format!( - "cannot read column at offset {} from block with {} columns", - offset, - set.block.num_columns() - )) - })?; - let cell = col - .value - .index(row) - .ok_or_else(|| { - ErrorCode::ScriptExecutionError(format!( - "cannot read value at row {} from column with {} rows", - row, - set.block.num_rows(), - )) - })? 
- .to_owned(); - - Ok(cell) - } - - fn set_len(&self, set: &Self::Set) -> usize { - set.block.num_rows() - } - - fn is_true(&self, scalar: &Self::Var) -> Result { - match scalar { - Scalar::Boolean(v) => Ok(*v), - _ => Err(ErrorCode::ScriptExecutionError(format!( - "`is_true` called on non-boolean value {scalar}", - ))), - } - } -} diff --git a/src/query/service/src/interpreters/interpreter_factory.rs b/src/query/service/src/interpreters/interpreter_factory.rs index 7a082574f04e..d90edac923a9 100644 --- a/src/query/service/src/interpreters/interpreter_factory.rs +++ b/src/query/service/src/interpreters/interpreter_factory.rs @@ -51,6 +51,7 @@ use crate::interpreters::interpreter_notification_create::CreateNotificationInte use crate::interpreters::interpreter_notification_desc::DescNotificationInterpreter; use crate::interpreters::interpreter_notification_drop::DropNotificationInterpreter; use crate::interpreters::interpreter_presign::PresignInterpreter; +use crate::interpreters::interpreter_procedure_call::CallProcedureInterpreter; use crate::interpreters::interpreter_procedure_create::CreateProcedureInterpreter; use crate::interpreters::interpreter_procedure_drop::DropProcedureInterpreter; use crate::interpreters::interpreter_role_show::ShowRolesInterpreter; @@ -596,6 +597,10 @@ impl InterpreterFactory { ctx, *p.clone(), )?)), + Plan::CallProcedure(p) => Ok(Arc::new(CallProcedureInterpreter::try_create( + ctx, + *p.clone(), + )?)), // Plan::ShowCreateProcedure(_) => {} // // Plan::RenameProcedure(p) => Ok(Arc::new(RenameProcedureInterpreter::try_create( diff --git a/src/query/service/src/interpreters/interpreter_procedure_call.rs b/src/query/service/src/interpreters/interpreter_procedure_call.rs new file mode 100644 index 000000000000..8e264910c8eb --- /dev/null +++ b/src/query/service/src/interpreters/interpreter_procedure_call.rs @@ -0,0 +1,134 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
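The mutable_catalog.rs hunk earlier now fetches the sequence meta as an `Option` and turns a miss into a typed sequence error inside the catalog, instead of delegating that decision to the meta client. A toy sketch of that lookup-then-error shape, with placeholder types rather than the real `KVAppError`/`SequenceError` definitions:

```rust
use std::collections::HashMap;

struct SequenceMeta {
    current: u64,
}

#[derive(Debug)]
enum CatalogError {
    UnknownSequence(String),
}

// Stand-in for MutableCatalog::get_sequence: a missing entry becomes a
// typed error rather than a raw Option.
fn get_sequence(store: &HashMap<String, SequenceMeta>, name: &str) -> Result<u64, CatalogError> {
    let Some(meta) = store.get(name) else {
        return Err(CatalogError::UnknownSequence(name.to_string()));
    };
    Ok(meta.current)
}

fn main() {
    let store = HashMap::new();
    assert!(matches!(
        get_sequence(&store, "seq_id"),
        Err(CatalogError::UnknownSequence(_))
    ));
}
```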
+ +use std::sync::Arc; + +use databend_common_ast::ast::DeclareItem; +use databend_common_ast::ast::DeclareVar; +use databend_common_ast::ast::Identifier; +use databend_common_ast::ast::ScriptStatement; +use databend_common_ast::parser::run_parser; +use databend_common_ast::parser::script::script_block; +use databend_common_ast::parser::tokenize_sql; +use databend_common_ast::parser::ParseMode; +use databend_common_exception::Result; +use databend_common_expression::block_debug::box_render; +use databend_common_expression::types::StringType; +use databend_common_expression::DataBlock; +use databend_common_expression::FromData; +use databend_common_script::compile; +use databend_common_script::Executor; +use databend_common_script::ReturnValue; +use databend_common_sql::plans::CallProcedurePlan; +use databend_common_storages_fuse::TableContext; + +use crate::interpreters::util::ScriptClient; +use crate::interpreters::Interpreter; +use crate::pipelines::PipelineBuildResult; +use crate::sessions::QueryContext; + +#[derive(Debug)] +pub struct CallProcedureInterpreter { + ctx: Arc, + plan: CallProcedurePlan, +} + +impl CallProcedureInterpreter { + pub fn try_create(ctx: Arc, plan: CallProcedurePlan) -> Result { + Ok(CallProcedureInterpreter { ctx, plan }) + } +} + +#[async_trait::async_trait] +impl Interpreter for CallProcedureInterpreter { + fn name(&self) -> &str { + "ProcedureCall" + } + + fn is_ddl(&self) -> bool { + false + } + + #[fastrace::trace] + #[async_backtrace::framed] + async fn execute2(&self) -> Result { + let res: Result<_> = try { + let mut src = vec![]; + for (arg, arg_name) in self.plan.args.iter().zip(self.plan.arg_names.iter()) { + src.push(ScriptStatement::LetVar { + declare: DeclareVar { + span: None, + name: Identifier::from_name(None, arg_name), + default: arg.clone(), + }, + }); + } + let settings = self.ctx.get_settings(); + let sql_dialect = settings.get_sql_dialect()?; + let tokens = tokenize_sql(&self.plan.script)?; + let mut ast = run_parser( + &tokens, + sql_dialect, + ParseMode::Template, + false, + script_block, + )?; + + for declare in ast.declares { + match declare { + DeclareItem::Var(declare) => src.push(ScriptStatement::LetVar { declare }), + DeclareItem::Set(declare) => { + src.push(ScriptStatement::LetStatement { declare }) + } + } + } + src.append(&mut ast.body); + let compiled = compile(&src)?; + + let client = ScriptClient { + ctx: self.ctx.clone(), + }; + let mut executor = Executor::load(ast.span, client, compiled); + let script_max_steps = settings.get_script_max_steps()?; + let result = executor.run(script_max_steps as usize).await?; + + match result { + Some(ReturnValue::Var(scalar)) => { + PipelineBuildResult::from_blocks(vec![DataBlock::new_from_columns(vec![ + StringType::from_data(vec![scalar.to_string()]), + ])])? + } + Some(ReturnValue::Set(set)) => { + let rendered_table = box_render( + &set.schema, + &[set.block.clone()], + usize::MAX, + usize::MAX, + usize::MAX, + true, + )?; + let lines = rendered_table.lines().map(|x| x.to_string()).collect(); + PipelineBuildResult::from_blocks(vec![DataBlock::new_from_columns(vec![ + StringType::from_data(lines), + ])])? 
+ } + None => PipelineBuildResult::from_blocks(vec![DataBlock::new_from_columns(vec![ + StringType::from_data(Vec::::new()), + ])])?, + } + }; + + res.map_err(|err| err.display_with_sql(&self.plan.script)) + } +} diff --git a/src/query/service/src/interpreters/interpreter_table_analyze.rs b/src/query/service/src/interpreters/interpreter_table_analyze.rs index 74e9f2a40925..f3a5d2b53bcc 100644 --- a/src/query/service/src/interpreters/interpreter_table_analyze.rs +++ b/src/query/service/src/interpreters/interpreter_table_analyze.rs @@ -298,6 +298,7 @@ fn remove_exchange(plan: PhysicalPlan) -> PhysicalPlan { input: Box::new(traverse(*plan.input)), group_by: plan.group_by, agg_funcs: plan.agg_funcs, + rank_limit: plan.rank_limit, enable_experimental_aggregate_hashtable: plan .enable_experimental_aggregate_hashtable, group_by_display: plan.group_by_display, @@ -310,7 +311,6 @@ fn remove_exchange(plan: PhysicalPlan) -> PhysicalPlan { group_by: plan.group_by, agg_funcs: plan.agg_funcs, before_group_by_schema: plan.before_group_by_schema, - limit: plan.limit, group_by_display: plan.group_by_display, stat_info: plan.stat_info, }), diff --git a/src/query/service/src/interpreters/mod.rs b/src/query/service/src/interpreters/mod.rs index 1c2b738d48e0..d46cc641b8a5 100644 --- a/src/query/service/src/interpreters/mod.rs +++ b/src/query/service/src/interpreters/mod.rs @@ -72,6 +72,7 @@ mod interpreter_password_policy_drop; mod interpreter_presign; mod interpreter_privilege_grant; mod interpreter_privilege_revoke; +mod interpreter_procedure_call; mod interpreter_procedure_create; mod interpreter_procedure_drop; mod interpreter_replace; diff --git a/src/query/service/src/interpreters/util.rs b/src/query/service/src/interpreters/util.rs index 4a094d24554c..26b7e6789a57 100644 --- a/src/query/service/src/interpreters/util.rs +++ b/src/query/service/src/interpreters/util.rs @@ -12,9 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
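`CallProcedureInterpreter::execute2` above binds each CALL argument by synthesizing one `LET` declaration per `(arg, arg_name)` pair and prepending it to the stored procedure body before the script is parsed, compiled, and run. A string-level toy of that binding step; the real interpreter builds `ScriptStatement::LetVar` AST nodes, and the script syntax shown here is only illustrative:

```rust
// Prepend one LET declaration per CALL argument to the procedure body.
fn bind_args(body: &str, args: &[(&str, &str)]) -> String {
    let mut script = String::new();
    for (name, value) in args {
        script.push_str(&format!("LET {name} := {value};\n"));
    }
    script.push_str(body);
    script
}

fn main() {
    let script = bind_args("RETURN :x + :y;", &[("x", "1"), ("y", "2")]);
    assert_eq!(script.lines().next(), Some("LET x := 1;"));
}
```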
+use std::sync::Arc; + +use databend_common_ast::ast::Expr; +use databend_common_ast::parser::parse_expr; +use databend_common_ast::parser::tokenize_sql; +use databend_common_ast::parser::Dialect; +use databend_common_exception::ErrorCode; use databend_common_expression::ComputedExpr; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; +use databend_common_script::ir::ColumnAccess; +use databend_common_script::Client; +use databend_common_sql::Planner; +use futures_util::TryStreamExt; +use itertools::Itertools; + +use crate::interpreters::InterpreterFactory; +use crate::sessions::QueryContext; #[allow(clippy::type_complexity)] pub fn generate_desc_schema( @@ -61,3 +78,109 @@ pub fn generate_desc_schema( } (names, types, nulls, default_exprs, extras) } + +#[derive(Debug, Clone)] +pub struct QueryResult { + pub(crate) schema: DataSchemaRef, + pub(crate) block: DataBlock, +} + +pub struct ScriptClient { + pub(crate) ctx: Arc, +} + +impl Client for ScriptClient { + type Var = Scalar; + type Set = QueryResult; + + async fn query(&self, query: &str) -> databend_common_exception::Result { + let ctx = self + .ctx + .get_current_session() + .create_query_context() + .await?; + + let mut planner = Planner::new(ctx.clone()); + let (plan, _) = planner.plan_sql(query).await?; + let interpreter = InterpreterFactory::get(ctx.clone(), &plan).await?; + let stream = interpreter.execute(ctx.clone()).await?; + let blocks = stream.try_collect::>().await?; + let schema = plan.schema(); + + let block = match blocks.len() { + 0 => DataBlock::empty_with_schema(schema.clone()), + 1 => blocks[0].clone(), + _ => DataBlock::concat(&blocks)?, + }; + + Ok(QueryResult { schema, block }) + } + + fn var_to_ast(&self, scalar: &Self::Var) -> databend_common_exception::Result { + let scalar = scalar.to_string(); + let ast = parse_expr(&tokenize_sql(&scalar)?, Dialect::PostgreSQL)?; + + Ok(ast) + } + + fn read_from_set( + &self, + set: &Self::Set, + row: usize, + col: &ColumnAccess, + ) -> databend_common_exception::Result { + let offset = match col { + ColumnAccess::Position(offset) => *offset, + // TODO(andylokandy): name resolution + ColumnAccess::Name(name) => set + .schema + .fields() + .iter() + .position(|f| f.name() == name) + .ok_or_else(|| { + ErrorCode::ScriptExecutionError(format!( + "cannot find column with name {} in block, available columns: {}", + name, + set.schema + .fields() + .iter() + .map(|f| format!("'{}'", f.name())) + .join(", ") + )) + })?, + }; + let col = set.block.columns().get(offset).ok_or_else(|| { + ErrorCode::ScriptExecutionError(format!( + "cannot read column at offset {} from block with {} columns", + offset, + set.block.num_columns() + )) + })?; + let cell = col + .value + .index(row) + .ok_or_else(|| { + ErrorCode::ScriptExecutionError(format!( + "cannot read value at row {} from column with {} rows", + row, + set.block.num_rows(), + )) + })? 
+ .to_owned(); + + Ok(cell) + } + + fn num_rows(&self, set: &Self::Set) -> usize { + set.block.num_rows() + } + + fn is_true(&self, scalar: &Self::Var) -> databend_common_exception::Result { + match scalar { + Scalar::Boolean(v) => Ok(*v), + _ => Err(ErrorCode::ScriptExecutionError(format!( + "`is_true` called on non-boolean value {scalar}", + ))), + } + } +} diff --git a/src/query/service/src/pipelines/builders/builder_aggregate.rs b/src/query/service/src/pipelines/builders/builder_aggregate.rs index 1ecf01b6b0f8..001d1bafa83a 100644 --- a/src/query/service/src/pipelines/builders/builder_aggregate.rs +++ b/src/query/service/src/pipelines/builders/builder_aggregate.rs @@ -23,10 +23,13 @@ use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::HashMethodKind; use databend_common_expression::HashTableConfig; +use databend_common_expression::LimitType; +use databend_common_expression::SortColumnDescription; use databend_common_functions::aggregates::AggregateFunctionFactory; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::query_spill_prefix; use databend_common_pipeline_transforms::processors::TransformPipelineHelper; +use databend_common_pipeline_transforms::processors::TransformSortPartial; use databend_common_sql::executor::physical_plans::AggregateExpand; use databend_common_sql::executor::physical_plans::AggregateFinal; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; @@ -111,7 +114,6 @@ impl PipelineBuilder { enable_experimental_aggregate_hashtable, self.is_exchange_neighbor, max_block_size as usize, - None, max_spill_io_requests as usize, )?; @@ -125,7 +127,7 @@ impl PipelineBuilder { let group_cols = ¶ms.group_columns; let schema_before_group_by = params.input_schema.clone(); - let sample_block = DataBlock::empty_with_schema(schema_before_group_by); + let sample_block = DataBlock::empty_with_schema(schema_before_group_by.clone()); let method = DataBlock::choose_hash_method(&sample_block, group_cols, efficiently_memory)?; // Need a global atomic to read the max current radix bits hint @@ -136,6 +138,28 @@ impl PipelineBuilder { .cluster_with_partial(true, self.ctx.get_cluster().nodes.len()) }; + // For rank limit, we can filter data using sort with rank before partial + if let Some(rank_limit) = &aggregate.rank_limit { + let sort_desc = rank_limit + .0 + .iter() + .map(|desc| { + let offset = schema_before_group_by.index_of(&desc.order_by.to_string())?; + Ok(SortColumnDescription { + offset, + asc: desc.asc, + nulls_first: desc.nulls_first, + is_nullable: schema_before_group_by.field(offset).is_nullable(), // This information is not needed here. 
+ }) + }) + .collect::>>()?; + let sort_desc = Arc::new(sort_desc); + + self.main_pipeline.add_transformer(|| { + TransformSortPartial::new(LimitType::LimitRank(rank_limit.1), sort_desc.clone()) + }); + } + self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create( match params.aggregate_functions.is_empty() { @@ -225,7 +249,6 @@ impl PipelineBuilder { enable_experimental_aggregate_hashtable, self.is_exchange_neighbor, max_block_size as usize, - aggregate.limit, max_spill_io_requests as usize, )?; @@ -292,7 +315,6 @@ impl PipelineBuilder { enable_experimental_aggregate_hashtable: bool, cluster_aggregator: bool, max_block_size: usize, - limit: Option, max_spill_io_requests: usize, ) -> Result> { let mut agg_args = Vec::with_capacity(agg_funcs.len()); @@ -335,7 +357,6 @@ impl PipelineBuilder { enable_experimental_aggregate_hashtable, cluster_aggregator, max_block_size, - limit, max_spill_io_requests, )?; diff --git a/src/query/service/src/pipelines/builders/builder_copy_into_table.rs b/src/query/service/src/pipelines/builders/builder_copy_into_table.rs index ecf20454f27f..361cb573b406 100644 --- a/src/query/service/src/pipelines/builders/builder_copy_into_table.rs +++ b/src/query/service/src/pipelines/builders/builder_copy_into_table.rs @@ -231,7 +231,7 @@ impl PipelineBuilder { let req = UpsertTableCopiedFileReq { file_info: copied_file_tree, ttl: Some(Duration::from_hours(expire_hours)), - fail_if_duplicated: !force, + insert_if_not_exists: !force, }; Some(req) } diff --git a/src/query/service/src/pipelines/builders/builder_insert_multi_table.rs b/src/query/service/src/pipelines/builders/builder_insert_multi_table.rs index 11d3d0378c97..a5164b49cb3a 100644 --- a/src/query/service/src/pipelines/builders/builder_insert_multi_table.rs +++ b/src/query/service/src/pipelines/builders/builder_insert_multi_table.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use databend_common_catalog::catalog::CatalogManager; use databend_common_exception::Result; use databend_common_expression::DataSchema; +use databend_common_expression::LimitType; use databend_common_expression::SortColumnDescription; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::DynTransformBuilder; @@ -230,7 +231,7 @@ impl PipelineBuilder { Ok(ProcessorPtr::create(TransformSortPartial::try_create( transform_input_port, transform_output_port, - None, + LimitType::None, sort_desc.clone(), )?)) }, diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index 5ae5aaa1b345..6c933b165b0b 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -16,6 +16,7 @@ use std::sync::Arc; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; +use databend_common_expression::LimitType; use databend_common_expression::SortColumnDescription; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::query_spill_prefix; @@ -197,7 +198,12 @@ impl SortPipelineBuilder { pub fn build_full_sort_pipeline(self, pipeline: &mut Pipeline) -> Result<()> { // Partial sort - pipeline.add_transformer(|| TransformSortPartial::new(self.limit, self.sort_desc.clone())); + pipeline.add_transformer(|| { + TransformSortPartial::new( + LimitType::from_limit_rows(self.limit), + self.sort_desc.clone(), + ) + }); self.build_merge_sort_pipeline(pipeline, false) } diff --git 
a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index ece829239a5b..0636fd308748 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; +use std::sync::atomic::AtomicUsize; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; @@ -22,21 +22,15 @@ use databend_common_expression::with_number_mapped_type; use databend_common_expression::SortColumnDescription; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_core::query_spill_prefix; use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; use databend_common_sql::executor::physical_plans::Window; use databend_common_sql::executor::physical_plans::WindowPartition; -use databend_common_storage::DataOperator; -use tokio::sync::Semaphore; use crate::pipelines::processors::transforms::FrameBound; -use crate::pipelines::processors::transforms::TransformWindowPartitionBucket; +use crate::pipelines::processors::transforms::TransformWindowPartitionCollect; use crate::pipelines::processors::transforms::TransformWindowPartitionScatter; -use crate::pipelines::processors::transforms::TransformWindowPartitionSort; -use crate::pipelines::processors::transforms::TransformWindowPartitionSpillReader; -use crate::pipelines::processors::transforms::TransformWindowPartitionSpillWriter; use crate::pipelines::processors::transforms::WindowFunctionInfo; +use crate::pipelines::processors::transforms::WindowSpillSettings; use crate::pipelines::processors::TransformWindow; use crate::pipelines::PipelineBuilder; @@ -45,7 +39,6 @@ impl PipelineBuilder { self.build_pipeline(&window.input)?; let input_schema = window.input.output_schema()?; - let partition_by = window .partition_by .iter() @@ -54,7 +47,6 @@ impl PipelineBuilder { Ok(offset) }) .collect::>>()?; - let order_by = window .order_by .iter() @@ -139,20 +131,29 @@ impl PipelineBuilder { Ok(()) } - pub(crate) fn build_window_partition_pipeline( + pub(crate) fn build_window_partition( &mut self, window_partition: &WindowPartition, ) -> Result<()> { self.build_pipeline(&window_partition.input)?; - let plan_schema = window_partition.output_schema()?; + let num_processors = self.main_pipeline.output_len(); + + // Settings. + let settings = self.ctx.get_settings(); + let num_partitions = settings.get_window_num_partitions()?; + let max_block_size = settings.get_max_block_size()? as usize; + let sort_block_size = settings.get_window_partition_sort_block_size()? 
as usize; + let sort_spilling_batch_bytes = settings.get_sort_spilling_batch_bytes()?; + let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?; + let window_spill_settings = WindowSpillSettings::new(settings.clone(), num_processors)?; + let plan_schema = window_partition.output_schema()?; let partition_by = window_partition .partition_by .iter() .map(|index| plan_schema.index_of(&index.to_string())) .collect::>>()?; - let sort_desc = window_partition .order_by .iter() @@ -166,78 +167,58 @@ impl PipelineBuilder { }) }) .collect::>>()?; - - self.main_pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - TransformWindowPartitionScatter::try_create( - self.ctx.clone(), - input, - output, - partition_by.clone(), - )?, - )) - })?; - - let operator = DataOperator::instance().operator(); - let location_prefix = - query_spill_prefix(self.ctx.get_tenant().tenant_name(), &self.ctx.get_id()); - self.main_pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - TransformWindowPartitionSpillWriter::create( - self.ctx.clone(), - input, - output, - operator.clone(), - location_prefix.clone(), - ), - )) - })?; - - let input_nums = self.main_pipeline.output_len(); - let transform = TransformWindowPartitionBucket::create(input_nums)?; - - let inputs = transform.get_inputs(); - let output = transform.get_output(); - - self.main_pipeline - .add_pipe(Pipe::create(inputs.len(), 1, vec![PipeItem::create( - ProcessorPtr::create(Box::new(transform)), - inputs, - vec![output], - )])); - - self.main_pipeline.try_resize(input_nums)?; - - let max_spill_io_requests = self.settings.get_max_spill_io_requests()? as usize; - let semaphore = Arc::new(Semaphore::new(max_spill_io_requests)); - self.main_pipeline.add_transform(|input, output| { - TransformWindowPartitionSpillReader::create( - input, - output, - operator.clone(), - semaphore.clone(), - ) - })?; - - let block_size = self.settings.get_max_block_size()? as usize; - let sort_spilling_batch_bytes = self.ctx.get_settings().get_sort_spilling_batch_bytes()?; - let enable_loser_tree = self.ctx.get_settings().get_enable_loser_tree_merge_sort()?; let have_order_col = window_partition.after_exchange.unwrap_or(false); - self.main_pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create( - TransformWindowPartitionSort::try_create( - input, - output, - sort_desc.clone(), - plan_schema.clone(), - block_size, - sort_spilling_batch_bytes, - enable_loser_tree, - have_order_col, - )?, - )) - })?; + // 1. Build window partition scatter processors. + let mut pipe_items = Vec::with_capacity(num_processors); + for _ in 0..num_processors { + let processor = TransformWindowPartitionScatter::new( + num_processors, + num_partitions, + partition_by.clone(), + )?; + pipe_items.push(processor.into_pipe_item()); + } + self.main_pipeline.add_pipe(Pipe::create( + num_processors, + num_processors * num_processors, + pipe_items, + )); + + // 2. Build shuffle processor. + let mut rule = Vec::with_capacity(num_processors * num_processors); + for i in 0..num_processors * num_processors { + rule.push( + (i * num_processors + i / num_processors) % (num_processors * num_processors), + ); + } + self.main_pipeline.reorder_inputs(rule); + + // 3. Build window partition collect processors. 
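The shuffle rule built just above, `(i * num_processors + i / num_processors) % (num_processors * num_processors)`, is the square-matrix transpose `r*N + c -> c*N + r`: it lines up the `c`-th output of every scatter processor in front of the `c`-th collect processor (and since a transpose is its own inverse, the permutation reads the same in either direction). A quick standalone check, not part of the patch; the collect step continues below:

```rust
fn shuffle_rule(n: usize) -> Vec<usize> {
    (0..n * n).map(|i| (i * n + i / n) % (n * n)).collect()
}

fn main() {
    assert_eq!(shuffle_rule(3), vec![0, 3, 6, 1, 4, 7, 2, 5, 8]);
    for n in 1..6 {
        let rule = shuffle_rule(n);
        for r in 0..n {
            for c in 0..n {
                // transpose: position r*n + c maps to c*n + r
                assert_eq!(rule[r * n + c], c * n + r);
            }
        }
    }
}
```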
+ let processor_id = AtomicUsize::new(0); + let mut pipe_items = Vec::with_capacity(num_processors); + for _ in 0..num_processors { + let processor = TransformWindowPartitionCollect::new( + self.ctx.clone(), + processor_id.fetch_add(1, std::sync::atomic::Ordering::AcqRel), + num_processors, + num_partitions, + window_spill_settings.clone(), + sort_desc.clone(), + plan_schema.clone(), + max_block_size, + sort_block_size, + sort_spilling_batch_bytes, + enable_loser_tree, + have_order_col, + )?; + pipe_items.push(processor.into_pipe_item()); + } + self.main_pipeline.add_pipe(Pipe::create( + num_processors * num_processors, + num_processors, + pipe_items, + )); Ok(()) } diff --git a/src/query/service/src/pipelines/executor/executor_graph.rs b/src/query/service/src/pipelines/executor/executor_graph.rs index 3d66db20c40d..7a56e250ee5a 100644 --- a/src/query/service/src/pipelines/executor/executor_graph.rs +++ b/src/query/service/src/pipelines/executor/executor_graph.rs @@ -998,7 +998,16 @@ impl Debug for ExecutingGraph { write!( f, "{:?}", - Dot::with_config(&self.graph, &[Config::EdgeNoLabel]) + Dot::with_attr_getters( + &self.graph, + &[Config::EdgeNoLabel], + &|_, edge| format!( + "{} -> {}", + edge.weight().output_index, + edge.weight().input_index + ), + &|_, (_, _)| String::new(), + ) ) } } diff --git a/src/query/service/src/pipelines/pipeline_builder.rs b/src/query/service/src/pipelines/pipeline_builder.rs index 9b2c42e58e02..653324ffbdab 100644 --- a/src/query/service/src/pipelines/pipeline_builder.rs +++ b/src/query/service/src/pipelines/pipeline_builder.rs @@ -168,7 +168,7 @@ impl PipelineBuilder { PhysicalPlan::AggregateFinal(aggregate) => self.build_aggregate_final(aggregate), PhysicalPlan::Window(window) => self.build_window(window), PhysicalPlan::WindowPartition(window_partition) => { - self.build_window_partition_pipeline(window_partition) + self.build_window_partition(window_partition) } PhysicalPlan::Sort(sort) => self.build_sort(sort), PhysicalPlan::Limit(limit) => self.build_limit(limit), diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs index 9c1466184a77..f1dfb320c3d0 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregator_params.rs @@ -44,8 +44,6 @@ pub struct AggregatorParams { pub enable_experimental_aggregate_hashtable: bool, pub cluster_aggregator: bool, pub max_block_size: usize, - // Limit is push down to AggregatorTransform - pub limit: Option, pub max_spill_io_requests: usize, } @@ -59,7 +57,6 @@ impl AggregatorParams { enable_experimental_aggregate_hashtable: bool, cluster_aggregator: bool, max_block_size: usize, - limit: Option, max_spill_io_requests: usize, ) -> Result> { let mut states_offsets: Vec = Vec::with_capacity(agg_funcs.len()); @@ -80,7 +77,6 @@ impl AggregatorParams { enable_experimental_aggregate_hashtable, cluster_aggregator, max_block_size, - limit, max_spill_io_requests, })) } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs index 482bca8238f2..23598d0515c5 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_final.rs @@ -46,7 +46,6 @@ pub struct TransformFinalAggregate { method: Method, params: Arc, flush_state: PayloadFlushState, - reach_limit: bool, } impl TransformFinalAggregate { @@ -63,7 +62,6 @@ impl TransformFinalAggregate { method, params, flush_state: PayloadFlushState::default(), - reach_limit: false, }, )) } @@ -124,23 +122,11 @@ impl TransformFinalAggregate { let mut blocks = vec![]; self.flush_state.clear(); - let mut rows = 0; loop { if ht.merge_result(&mut self.flush_state)? { let mut cols = self.flush_state.take_aggregate_results(); cols.extend_from_slice(&self.flush_state.take_group_columns()); - rows += cols[0].len(); blocks.push(DataBlock::new_from_columns(cols)); - - if rows >= self.params.limit.unwrap_or(usize::MAX) { - log::info!( - "reach limit optimization in flush agg hashtable, current {}, total {}", - rows, - ht.len(), - ); - self.reach_limit = true; - break; - } } else { break; } @@ -162,10 +148,6 @@ where Method: HashMethodBounds const NAME: &'static str = "TransformFinalAggregate"; fn transform(&mut self, meta: AggregateMeta) -> Result> { - if self.reach_limit { - return Ok(vec![self.params.empty_result_block()]); - } - if self.params.enable_experimental_aggregate_hashtable { return Ok(vec![self.transform_agg_hashtable(meta)?]); } @@ -196,18 +178,8 @@ where Method: HashMethodBounds let (len, _) = keys_iter.size_hint(); let mut places = Vec::with_capacity(len); - let mut current_len = hash_cell.hashtable.len(); unsafe { for key in keys_iter { - if self.reach_limit { - let entry = hash_cell.hashtable.entry(key); - if let Some(entry) = entry { - let place = Into::::into(*entry.get()); - places.push(place); - } - continue; - } - match hash_cell.hashtable.insert_and_entry(key) { Ok(mut entry) => { let place = @@ -215,13 +187,6 @@ where Method: HashMethodBounds places.push(place); *entry.get_mut() = place.addr(); - - if let Some(limit) = self.params.limit { - current_len += 1; - if current_len >= limit { - self.reach_limit = true; - } - } } Err(entry) => { let place = Into::::into(*entry.get()); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs index 5a21e7d3c4f9..2065d6e7c4bb 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_final.rs @@ -40,7 +40,6 @@ pub struct TransformFinalGroupBy { method: Method, params: Arc, flush_state: PayloadFlushState, - reach_limit: bool, } impl TransformFinalGroupBy { @@ -57,7 +56,6 @@ impl TransformFinalGroupBy { method, params, flush_state: PayloadFlushState::default(), - reach_limit: false, }, )) } @@ -118,22 +116,10 @@ impl TransformFinalGroupBy { let mut blocks = vec![]; self.flush_state.clear(); - let mut rows = 0; loop { if ht.merge_result(&mut self.flush_state)? 
{ let cols = self.flush_state.take_group_columns(); - rows += cols[0].len(); blocks.push(DataBlock::new_from_columns(cols)); - - if rows >= self.params.limit.unwrap_or(usize::MAX) { - log::info!( - "reach limit optimization in flush agg hashtable, current {}, total {}", - rows, - ht.len(), - ); - self.reach_limit = true; - break; - } } else { break; } @@ -155,10 +141,6 @@ where Method: HashMethodBounds const NAME: &'static str = "TransformFinalGroupBy"; fn transform(&mut self, meta: AggregateMeta) -> Result> { - if self.reach_limit { - return Ok(vec![self.params.empty_result_block()]); - } - if self.params.enable_experimental_aggregate_hashtable { return Ok(vec![self.transform_agg_hashtable(meta)?]); } @@ -167,7 +149,7 @@ where Method: HashMethodBounds let arena = Arc::new(Bump::new()); let mut hashtable = self.method.create_hash_table::<()>(arena)?; - 'merge_hashtable: for bucket_data in data { + for bucket_data in data { match bucket_data { AggregateMeta::Spilled(_) => unreachable!(), AggregateMeta::BucketSpilled(_) => unreachable!(), @@ -182,13 +164,6 @@ where Method: HashMethodBounds for key in keys_iter.iter() { let _ = hashtable.insert_and_entry(key); } - - if let Some(limit) = self.params.limit { - if hashtable.len() >= limit { - self.reach_limit = true; - break 'merge_hashtable; - } - } } } AggregateMeta::HashTable(payload) => unsafe { @@ -197,12 +172,6 @@ where Method: HashMethodBounds for key in payload.cell.hashtable.iter() { let _ = hashtable.insert_and_entry(key.key()); } - - if let Some(limit) = self.params.limit { - if hashtable.len() >= limit { - break 'merge_hashtable; - } - } }, AggregateMeta::AggregatePayload(_) => unreachable!(), AggregateMeta::AggregateSpilling(_) => unreachable!(), diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs index ca1025c9d70c..c9c0a9977341 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs @@ -28,7 +28,8 @@ use databend_common_storages_fuse::TableContext; use crate::pipelines::processors::transforms::hash_join::spill_common::get_hashes; use crate::pipelines::processors::HashJoinState; use crate::sessions::QueryContext; -use crate::spillers::SpillBuffer; +use crate::spillers::PartitionBuffer; +use crate::spillers::PartitionBufferFetchOption; use crate::spillers::Spiller; use crate::spillers::SpillerConfig; use crate::spillers::SpillerType; @@ -37,7 +38,8 @@ use crate::spillers::SpillerType; /// it is used for both build side and probe side. pub struct HashJoinSpiller { spiller: Spiller, - spill_buffer: SpillBuffer, + partition_buffer: PartitionBuffer, + partition_threshold: usize, join_type: JoinType, is_build_side: bool, func_ctx: FunctionContext, @@ -56,7 +58,7 @@ impl HashJoinSpiller { hash_keys: Vec, hash_method: HashMethodKind, spill_partition_bits: usize, - spill_buffer_threshold: usize, + partition_buffer_threshold: usize, is_build_side: bool, ) -> Result { // Create a Spiller for spilling build side data. @@ -72,13 +74,19 @@ impl HashJoinSpiller { }; let spiller = Spiller::create(ctx.clone(), operator, spill_config, spiller_type)?; - // Create a SpillBuffer to buffer data before spilling. 
- let spill_buffer = SpillBuffer::create(1 << spill_partition_bits, spill_buffer_threshold); + let num_partitions = (1 << spill_partition_bits) as usize; + // The memory threshold of each partition, we will spill the partition data + // if the partition memory size exceeds the threshold. + let partition_threshold = partition_buffer_threshold * 1024 * 1024 / num_partitions; + + // Create a PartitionBuffer to buffer data before spilling. + let partition_buffer = PartitionBuffer::create(num_partitions); let join_type = join_state.join_type(); Ok(Self { spiller, - spill_buffer, + partition_buffer, + partition_threshold, spill_partition_bits, hash_keys, hash_method, @@ -100,8 +108,8 @@ impl HashJoinSpiller { self.partition_data_block(&data_block, &join_type, self.spill_partition_bits)?; for (partition_id, data_block) in partition_data_blocks.into_iter().enumerate() { if !data_block.is_empty() { - self.spill_buffer - .add_partition_data(partition_id, data_block); + self.partition_buffer + .add_data_block(partition_id, data_block); } } Ok(()) @@ -111,11 +119,13 @@ impl HashJoinSpiller { pub(crate) async fn spill( &mut self, data_blocks: &[DataBlock], - partition_need_to_spill: Option<&HashSet>, + partition_need_to_spill: Option<&HashSet>, ) -> Result> { let join_type = self.join_type.clone(); let mut unspilled_data_blocks = vec![]; let data_block = DataBlock::concat(data_blocks)?; + let fetch_option = + PartitionBufferFetchOption::PickPartitionWithThreshold(self.partition_threshold); for (partition_id, data_block) in self .partition_data_block(&data_block, &join_type, self.spill_partition_bits)? .into_iter() @@ -123,16 +133,20 @@ impl HashJoinSpiller { { if !data_block.is_empty() { if let Some(partition_need_to_spill) = partition_need_to_spill - && !partition_need_to_spill.contains(&(partition_id as u8)) + && !partition_need_to_spill.contains(&(partition_id)) { unspilled_data_blocks.push(data_block); continue; } - self.spill_buffer - .add_partition_data(partition_id, data_block); - if let Some(data_block) = self.spill_buffer.pick_data_to_spill(partition_id)? { + self.partition_buffer + .add_data_block(partition_id, data_block); + if let Some(data_blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &fetch_option)? + { + let data_block = DataBlock::concat(&data_blocks)?; self.spiller - .spill_with_partition(partition_id as u8, data_block) + .spill_with_partition(partition_id, data_block) .await?; } } @@ -141,13 +155,18 @@ impl HashJoinSpiller { } // Restore data blocks from SpillBuffer and spilled files. - pub(crate) async fn restore(&mut self, partition_id: u8) -> Result> { + pub(crate) async fn restore(&mut self, partition_id: usize) -> Result> { let mut data_blocks = vec![]; // 1. restore data from SpillBuffer. + let option = if self.can_pick_buffer() { + PartitionBufferFetchOption::PickPartitionWithThreshold(0) + } else { + PartitionBufferFetchOption::ReadPartition + }; if self.need_read_buffer() && let Some(buffer_blocks) = self - .spill_buffer - .read_partition_data(partition_id, self.can_pick_buffer()) + .partition_buffer + .fetch_data_blocks(partition_id, &option)? 
{ data_blocks.extend(buffer_blocks); } @@ -217,9 +236,9 @@ impl HashJoinSpiller { Ok(hashes) } - pub(crate) fn spilled_partitions(&self) -> HashSet { + pub(crate) fn spilled_partitions(&self) -> HashSet { let mut partition_ids = self.spiller.spilled_partitions(); - for partition_id in self.spill_buffer.buffered_partitions() { + for partition_id in self.partition_buffer.partition_ids() { partition_ids.insert(partition_id); } partition_ids @@ -227,7 +246,7 @@ impl HashJoinSpiller { pub fn has_next_restore_file(&self) -> bool { self.next_restore_file < self.spiller.spilled_files().len() - || (self.next_restore_file == 0 && !self.spill_buffer.empty_partition(0)) + || (self.next_restore_file == 0 && !self.partition_buffer.is_partition_empty(0)) } pub fn reset_next_restore_file(&mut self) { diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs index 238cd010fa6c..747e6994a953 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs @@ -16,7 +16,6 @@ use std::cell::SyncUnsafeCell; use std::collections::HashMap; use std::collections::HashSet; use std::sync::atomic::AtomicBool; -use std::sync::atomic::AtomicU8; use std::sync::atomic::AtomicUsize; use std::sync::atomic::Ordering; use std::sync::Arc; @@ -108,13 +107,13 @@ pub struct HashJoinState { /// It record whether spill has happened. pub(crate) is_spill_happened: AtomicBool, /// Spilled partition set, it contains all spilled partition sets from all build processors. - pub(crate) spilled_partitions: RwLock>, + pub(crate) spilled_partitions: RwLock>, /// Spill partition bits, it is used to calculate the number of partitions. pub(crate) spill_partition_bits: usize, /// Spill buffer size threshold. pub(crate) spill_buffer_threshold: usize, /// The next partition id to be restored. - pub(crate) partition_id: AtomicU8, + pub(crate) partition_id: AtomicUsize, /// Whether need next round, if it is true, restore data from spilled data and start next round. pub(crate) need_next_round: AtomicBool, /// Send message to notify all build processors to next round. 
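In the hash_join_spiller.rs hunk above, the flat `SpillBuffer` becomes a `PartitionBuffer`, and the spill threshold is now derived per partition: `partition_buffer_threshold` is treated as megabytes (hence the `* 1024 * 1024`) and divided by `num_partitions = 1 << spill_partition_bits`. Widening partition ids from `u8` to `usize` in the same change lifts the 256-partition ceiling that `u8` imposed. A tiny worked example of the threshold arithmetic (numbers are assumed, for illustration only):

```rust
// 512 MB buffered across 8 partitions (3 partition bits) means each
// partition spills once it holds more than 64 MB.
fn per_partition_threshold(threshold_mb: usize, spill_partition_bits: usize) -> usize {
    let num_partitions = 1usize << spill_partition_bits;
    threshold_mb * 1024 * 1024 / num_partitions
}

fn main() {
    assert_eq!(per_partition_threshold(512, 3), 64 * 1024 * 1024);
}
```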
@@ -176,7 +175,7 @@ impl HashJoinState { spilled_partitions: Default::default(), continue_build_watcher, _continue_build_dummy_receiver, - partition_id: AtomicU8::new(0), + partition_id: AtomicUsize::new(0), need_next_round: AtomicBool::new(false), is_spill_happened: AtomicBool::new(false), enable_spill, @@ -234,7 +233,7 @@ impl HashJoinState { self.need_outer_scan() || self.need_mark_scan() } - pub fn add_spilled_partitions(&self, partitions: &HashSet) { + pub fn add_spilled_partitions(&self, partitions: &HashSet) { let mut spilled_partitions = self.spilled_partitions.write(); spilled_partitions.extend(partitions); } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_build.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_build.rs index 86ed772c035e..472fb67a4d12 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_build.rs @@ -374,7 +374,7 @@ impl TransformHashJoinBuild { self.is_spill_happened } - fn partition_to_restore(&self) -> u8 { + fn partition_to_restore(&self) -> usize { self.build_state .hash_join_state .partition_id diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_probe.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_probe.rs index b151e174cf5e..fb5f9a649b35 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_probe.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_probe.rs @@ -109,7 +109,7 @@ pub struct TransformHashJoinProbe { // The spiller is used to spill/restore data blocks. spiller: HashJoinSpiller, // The next partition id to restore. - partition_id_to_restore: u8, + partition_id_to_restore: usize, step: Step, step_logs: Vec, diff --git a/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs index 38a7c1541f73..67e9e0d21c4c 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs @@ -269,7 +269,7 @@ where R: Rows + Sync + Send + 'static async fn spill(&mut self, block: DataBlock) -> Result<()> { debug_assert!(self.num_merge >= 2 && self.batch_rows > 0); - let location = self.spiller.spill_block(block).await?; + let location = self.spiller.spill(block).await?; self.unmerged_blocks.push_back(vec![location].into()); Ok(()) @@ -346,7 +346,7 @@ where R: Rows + Sync + Send + 'static let mut spilled = VecDeque::new(); while let Some(block) = merger.async_next_block().await? 
{ - let location = self.spiller.spill_block(block).await?; + let location = self.spiller.spill(block).await?; spilled.push_back(location); } @@ -622,24 +622,6 @@ mod tests { Ok(()) } - #[tokio::test(flavor = "multi_thread")] - async fn test_two_way_merge_sort() -> Result<()> { - let fixture = TestFixture::setup().await?; - let ctx = fixture.new_query_ctx().await?; - let (input, expected) = basic_test_data(None); - - test(ctx, input, expected, 4, 2, false, None).await - } - - #[tokio::test(flavor = "multi_thread")] - async fn test_two_way_merge_sort_with_memory_block() -> Result<()> { - let fixture = TestFixture::setup().await?; - let ctx = fixture.new_query_ctx().await?; - let (input, expected) = basic_test_data(None); - - test(ctx, input, expected, 4, 2, true, None).await - } - async fn basic_test( ctx: Arc, batch_rows: usize, diff --git a/src/query/service/src/pipelines/processors/transforms/window/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/mod.rs index 34ff5991f6ec..fa7a8ff77610 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/mod.rs @@ -13,12 +13,11 @@ // limitations under the License. mod frame_bound; -mod partition_by; +mod partition; mod transform_window; mod window_function; pub use frame_bound::FrameBound; +pub use partition::*; pub use transform_window::TransformWindow; pub use window_function::WindowFunctionInfo; - -pub use self::partition_by::*; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs similarity index 67% rename from src/query/service/src/pipelines/processors/transforms/window/partition_by/mod.rs rename to src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index f3b70b3eae20..05a06b00b2be 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ -12,16 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod transform_window_partition_bucket; +mod transform_window_partition_collect; mod transform_window_partition_scatter; -mod transform_window_partition_sort; -mod transform_window_partition_spill_reader; -mod transform_window_partition_spill_writer; +mod window_partition_buffer; mod window_partition_meta; -pub use transform_window_partition_bucket::*; +pub use transform_window_partition_collect::*; pub use transform_window_partition_scatter::*; -pub use transform_window_partition_sort::*; -pub use transform_window_partition_spill_reader::*; -pub use transform_window_partition_spill_writer::*; +pub use window_partition_buffer::*; pub use window_partition_meta::*; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs new file mode 100644 index 000000000000..0681acce0243 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -0,0 +1,322 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Some variables and functions are named and designed with reference to ClickHouse. +// - https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/Transforms/WindowTransform.h +// - https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/Transforms/WindowTransform.cpp + +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_transforms::processors::sort_merge; + +use super::WindowPartitionBuffer; +use super::WindowPartitionMeta; +use super::WindowSpillSettings; +use crate::sessions::QueryContext; + +#[derive(Debug, Clone, Copy)] +pub enum Step { + Sync(SyncStep), + Async(AsyncStep), + Finish, +} + +#[derive(Debug, Clone, Copy)] +pub enum SyncStep { + Collect, + Sort, +} + +#[derive(Debug, Clone, Copy)] +pub enum AsyncStep { + Spill, + Restore, +} + +pub struct TransformWindowPartitionCollect { + inputs: Vec>, + output: Arc, + + restored_data_blocks: Vec, + output_data_blocks: VecDeque, + + // The partition id is used to map the partition id to the new partition id. + partition_id: Vec, + // The buffer is used to control the memory usage of the window operator. + buffer: WindowPartitionBuffer, + + // Sort variables. + sort_desc: Vec, + schema: DataSchemaRef, + max_block_size: usize, + sort_spilling_batch_bytes: usize, + enable_loser_tree: bool, + have_order_col: bool, + + // Event variables. + step: Step, + is_collect_finished: bool, +} + +impl TransformWindowPartitionCollect { + #[allow(clippy::too_many_arguments)] + pub fn new( + ctx: Arc, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + spill_settings: WindowSpillSettings, + sort_desc: Vec, + schema: DataSchemaRef, + max_block_size: usize, + sort_block_size: usize, + sort_spilling_batch_bytes: usize, + enable_loser_tree: bool, + have_order_col: bool, + ) -> Result { + let inputs = (0..num_processors).map(|_| InputPort::create()).collect(); + let output = OutputPort::create(); + + // Calculate the partition ids collected by the processor. + let partitions: Vec = (0..num_partitions) + .filter(|&partition| partition % num_processors == processor_id) + .collect(); + + // Map each partition id to new partition id. + let mut partition_id = vec![0; num_partitions]; + for (new_partition_id, partition) in partitions.iter().enumerate() { + partition_id[*partition] = new_partition_id; + } + + // Create the window partition buffer. 
+ let buffer = + WindowPartitionBuffer::new(ctx, partitions.len(), sort_block_size, spill_settings)?; + + Ok(Self { + inputs, + output, + output_data_blocks: VecDeque::new(), + restored_data_blocks: Vec::new(), + partition_id, + buffer, + sort_desc, + schema, + max_block_size, + sort_spilling_batch_bytes, + enable_loser_tree, + have_order_col, + step: Step::Sync(SyncStep::Collect), + is_collect_finished: false, + }) + } + + pub fn into_pipe_item(self) -> PipeItem { + let inputs = self.inputs.clone(); + let outputs = vec![self.output.clone()]; + let processor_ptr = ProcessorPtr::create(Box::new(self)); + PipeItem::create(processor_ptr, inputs, outputs) + } + + fn next_step(&mut self, step: Step) -> Result { + let event = match step { + Step::Sync(_) => Event::Sync, + Step::Async(_) => Event::Async, + Step::Finish => { + for input in self.inputs.iter() { + input.finish(); + } + self.output.finish(); + Event::Finished + } + }; + self.step = step; + Ok(event) + } + + fn collect(&mut self) -> Result { + let mut finished_input = 0; + for input in self.inputs.iter() { + if input.is_finished() { + finished_input += 1; + continue; + } + + if input.has_data() { + Self::collect_data_block( + input.pull_data().unwrap()?, + &self.partition_id, + &mut self.buffer, + ); + } + + if input.is_finished() { + finished_input += 1; + } else { + input.set_need_data(); + } + } + + if finished_input == self.inputs.len() { + self.is_collect_finished = true; + } + + if self.need_spill() { + return self.next_step(Step::Async(AsyncStep::Spill)); + } + + if self.is_collect_finished { + self.next_step(Step::Async(AsyncStep::Restore)) + } else { + Ok(Event::NeedData) + } + } + + fn output(&mut self) -> Result { + if self.output.is_finished() { + return self.next_step(Step::Finish); + } + + if !self.output.can_push() { + return Ok(Event::NeedConsume); + } + + if self.need_spill() { + return self.next_step(Step::Async(AsyncStep::Spill)); + } + + if let Some(data_block) = self.output_data_blocks.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if !self.buffer.is_empty() { + self.next_step(Step::Async(AsyncStep::Restore)) + } else { + self.next_step(Step::Finish) + } + } +} + +#[async_trait::async_trait] +impl Processor for TransformWindowPartitionCollect { + fn name(&self) -> String { + "TransformWindowPartitionCollect".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + match self.step { + Step::Sync(sync_step) => match sync_step { + SyncStep::Collect => self.collect(), + SyncStep::Sort => self.output(), + }, + Step::Async(async_step) => match async_step { + AsyncStep::Spill => { + if self.need_spill() { + self.next_step(Step::Async(AsyncStep::Spill)) + } else if !self.is_collect_finished { + self.collect() + } else { + self.output() + } + } + AsyncStep::Restore => { + if !self.restored_data_blocks.is_empty() { + self.next_step(Step::Sync(SyncStep::Sort)) + } else { + self.next_step(Step::Finish) + } + } + }, + Step::Finish => Ok(Event::Finished), + } + } + + fn process(&mut self) -> Result<()> { + match self.step { + Step::Sync(SyncStep::Sort) => { + let restored_data_blocks = std::mem::take(&mut self.restored_data_blocks); + + let data_blocks = restored_data_blocks + .into_iter() + .map(|data_block| DataBlock::sort(&data_block, &self.sort_desc, None)) + .collect::>>()?; + + let sorted_data_blocks = sort_merge( + self.schema.clone(), + self.max_block_size, + self.sort_desc.clone(), + data_blocks, + 
self.sort_spilling_batch_bytes, + self.enable_loser_tree, + self.have_order_col, + )?; + + self.output_data_blocks.extend(sorted_data_blocks); + } + _ => unreachable!(), + } + Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + match &self.step { + Step::Async(AsyncStep::Spill) => self.buffer.spill().await?, + Step::Async(AsyncStep::Restore) => { + self.restored_data_blocks = self.buffer.restore().await?; + } + _ => unreachable!(), + } + Ok(()) + } +} + +impl TransformWindowPartitionCollect { + fn collect_data_block( + data_block: DataBlock, + partition_ids: &[usize], + buffer: &mut WindowPartitionBuffer, + ) { + if let Some(meta) = data_block + .get_owned_meta() + .and_then(WindowPartitionMeta::downcast_from) + { + for (partition_id, data_block) in meta.partitioned_data.into_iter() { + let partition_id = partition_ids[partition_id]; + buffer.add_data_block(partition_id, data_block); + } + } + } + + fn need_spill(&mut self) -> bool { + self.buffer.need_spill() + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_scatter.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_scatter.rs new file mode 100644 index 000000000000..9d17168be2a2 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_scatter.rs @@ -0,0 +1,185 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
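The collect transform above claims exactly the partitions with `partition % num_processors == processor_id` and renumbers them into dense local ids, while the scatter transform introduced next routes partition `p` to output `p % num_processors`; the two sides stay consistent only because they use the same modulo rule. The sketch below illustrates that contract with plain integers; `processor_of` and `partitions_of` are hypothetical helper names, not functions from the patch.

```rust
// Standalone sketch of the partition/processor contract used by the window
// partition scatter and collect transforms: the scatter side routes partition
// `p` to processor `p % num_processors`, and each collector claims exactly the
// partitions that map to it, remapping them to dense local ids.

fn processor_of(partition_id: usize, num_processors: usize) -> usize {
    partition_id % num_processors
}

/// Partitions owned by a processor, plus a global-id -> local-id map.
fn partitions_of(
    processor_id: usize,
    num_processors: usize,
    num_partitions: usize,
) -> (Vec<usize>, Vec<usize>) {
    let owned: Vec<usize> = (0..num_partitions)
        .filter(|&p| p % num_processors == processor_id)
        .collect();

    let mut local_id = vec![0; num_partitions];
    for (new_id, &p) in owned.iter().enumerate() {
        local_id[p] = new_id;
    }
    (owned, local_id)
}

fn main() {
    let (num_processors, num_partitions) = (4, 16);

    // Every partition routed by the scatter side is claimed by exactly one collector.
    for partition in 0..num_partitions {
        let target = processor_of(partition, num_processors);
        let (owned, local_id) = partitions_of(target, num_processors, num_partitions);
        assert!(owned.contains(&partition));
        // The local id indexes the collector's per-partition buffer densely.
        assert_eq!(owned[local_id[partition]], partition);
    }
}
```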
+ +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::group_hash_columns_slice; +use databend_common_expression::ColumnBuilder; +use databend_common_expression::DataBlock; +use databend_common_expression::Value; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::PipeItem; + +use super::WindowPartitionMeta; + +pub struct TransformWindowPartitionScatter { + input_port: Arc, + output_ports: Vec>, + input_data_blocks: VecDeque, + output_data_blocks: Vec>, + is_initialized: bool, + hash_keys: Vec, + num_processors: usize, + num_partitions: usize, +} + +impl TransformWindowPartitionScatter { + pub fn new( + num_processors: usize, + num_partitions: usize, + hash_keys: Vec, + ) -> Result { + let input_port = InputPort::create(); + let output_ports = (0..num_processors) + .map(|_| OutputPort::create()) + .collect::>(); + Ok(Self { + input_port, + output_ports, + input_data_blocks: VecDeque::new(), + output_data_blocks: vec![VecDeque::new(); num_processors], + is_initialized: false, + hash_keys, + num_processors, + num_partitions, + }) + } + + pub fn finish(&mut self) -> Result { + self.input_port.finish(); + for output_port in self.output_ports.iter() { + output_port.finish(); + } + Ok(Event::Finished) + } + + pub fn into_pipe_item(self) -> PipeItem { + let inputs = vec![self.input_port.clone()]; + let outputs = self.output_ports.clone(); + let processor_ptr = ProcessorPtr::create(Box::new(self)); + PipeItem::create(processor_ptr, inputs, outputs) + } +} + +impl Processor for TransformWindowPartitionScatter { + fn name(&self) -> String { + "TransformWindowPartitionScatter".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if !self.is_initialized { + self.is_initialized = true; + self.input_port.set_need_data(); + return Ok(Event::NeedData); + } + + let mut all_output_finished = true; + let mut need_consume = false; + for (index, output_port) in self.output_ports.iter().enumerate() { + if output_port.is_finished() { + continue; + } + all_output_finished = false; + + if !output_port.can_push() { + need_consume = true; + continue; + } + + if let Some(data_block) = self.output_data_blocks[index].pop_front() { + output_port.push_data(Ok(data_block)); + need_consume = true; + } + } + + if all_output_finished { + return self.finish(); + } + + if need_consume { + return Ok(Event::NeedConsume); + } + + if self.input_port.has_data() { + let data_block = self.input_port.pull_data().unwrap()?; + self.input_data_blocks.push_back(data_block); + return Ok(Event::Sync); + } + + if self.input_port.is_finished() { + return self.finish(); + } + + self.input_port.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + if let Some(data_block) = self.input_data_blocks.pop_front() { + let num_rows = data_block.num_rows(); + + // Extract the columns used for hash computation. 
+ let hash_cols = self + .hash_keys + .iter() + .map(|&offset| { + let entry = data_block.get_by_offset(offset); + match &entry.value { + Value::Scalar(s) => { + ColumnBuilder::repeat(&s.as_ref(), num_rows, &entry.data_type).build() + } + Value::Column(c) => c.clone(), + } + }) + .collect::>(); + + // Compute the hash value for each row. + let mut hashes = vec![0u64; num_rows]; + group_hash_columns_slice(&hash_cols, &mut hashes); + + // Scatter the data block to different partitions. + let indices = hashes + .iter() + .map(|&hash| (hash % self.num_partitions as u64) as u8) + .collect::>(); + let scatter_blocks = DataBlock::scatter(&data_block, &indices, self.num_partitions)?; + + // Partition the data blocks to different processors. + let mut output_data_blocks = vec![vec![]; self.num_processors]; + for (partition_id, data_block) in scatter_blocks.into_iter().enumerate() { + output_data_blocks[partition_id % self.num_processors] + .push((partition_id, data_block)); + } + + // Union data blocks for each processor. + for (partition_id, partitioned_data) in output_data_blocks.into_iter().enumerate() { + let meta = WindowPartitionMeta::create(partitioned_data); + let data_block = DataBlock::empty_with_meta(meta); + self.output_data_blocks[partition_id].push_back(data_block); + } + } + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs new file mode 100644 index 000000000000..dd93b0016c3c --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -0,0 +1,337 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_base::runtime::GLOBAL_MEM_STAT; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::query_spill_prefix; +use databend_common_settings::Settings; +use databend_common_storage::DataOperator; +use databend_common_storages_fuse::TableContext; + +use crate::sessions::QueryContext; +use crate::spillers::PartitionBuffer; +use crate::spillers::PartitionBufferFetchOption; +use crate::spillers::SpilledData; +use crate::spillers::Spiller; +use crate::spillers::SpillerConfig; +use crate::spillers::SpillerType; + +/// The `WindowPartitionBuffer` is used to control memory usage of Window operator. 
+pub struct WindowPartitionBuffer { + spiller: Spiller, + spill_settings: WindowSpillSettings, + partition_buffer: PartitionBuffer, + restored_partition_buffer: PartitionBuffer, + num_partitions: usize, + sort_block_size: usize, + can_spill: bool, + next_to_restore_partition_id: isize, + spilled_small_partitions: Vec>, + spilled_merged_partitions: Vec<(SpilledData, bool, bool)>, +} + +impl WindowPartitionBuffer { + pub fn new( + ctx: Arc, + num_partitions: usize, + sort_block_size: usize, + spill_settings: WindowSpillSettings, + ) -> Result { + // Create an inner `Spiller` to spill data. + let spill_config = SpillerConfig::create(query_spill_prefix( + ctx.get_tenant().tenant_name(), + &ctx.get_id(), + )); + let operator = DataOperator::instance().operator(); + let spiller = Spiller::create(ctx.clone(), operator, spill_config, SpillerType::Window)?; + + // Create a `PartitionBuffer` to store partitioned data. + let partition_buffer = PartitionBuffer::create(num_partitions); + let restored_partition_buffer = PartitionBuffer::create(num_partitions); + Ok(Self { + spiller, + spill_settings, + partition_buffer, + restored_partition_buffer, + num_partitions, + sort_block_size, + can_spill: false, + next_to_restore_partition_id: -1, + spilled_small_partitions: vec![Vec::new(); num_partitions], + spilled_merged_partitions: Vec::new(), + }) + } + + pub fn need_spill(&mut self) -> bool { + if !self.spill_settings.enable_spill || !self.can_spill { + return false; + } + self.out_of_memory_limit() + } + + pub fn out_of_memory_limit(&mut self) -> bool { + // Check if processor memory usage exceeds the threshold. + if self.partition_buffer.memory_size() + self.restored_partition_buffer.memory_size() + > self.spill_settings.processor_memory_threshold + { + return true; + } + + // Check if global memory usage exceeds the threshold. + let global_memory_usage = std::cmp::max(GLOBAL_MEM_STAT.get_memory_usage(), 0) as usize; + global_memory_usage > self.spill_settings.global_memory_threshold + } + + pub fn is_empty(&self) -> bool { + self.next_to_restore_partition_id + 1 >= self.num_partitions as isize + } + + pub fn add_data_block(&mut self, partition_id: usize, data_block: DataBlock) { + if data_block.is_empty() { + return; + } + self.partition_buffer + .add_data_block(partition_id, data_block); + self.can_spill = true; + } + + // Spill data blocks in the buffer. + pub async fn spill(&mut self) -> Result<()> { + let spill_unit_size = self.spill_settings.spill_unit_size; + + // Pick one partition from the last to the first to spill. + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + let next_to_restore_partition_id = (self.next_to_restore_partition_id + 1) as usize; + for partition_id in (next_to_restore_partition_id..self.num_partitions).rev() { + if !self.partition_buffer.is_partition_empty(partition_id) + && self.partition_buffer.partition_memory_size(partition_id) > spill_unit_size + { + if let Some(data_blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &option)? + { + return self + .spiller + .spill_with_partition(partition_id, DataBlock::concat(&data_blocks)?) + .await; + } + } + } + + // If there is no partition with size greater than `spill_unit_size`, then merge partitions to spill. 
+ let mut accumulated_bytes = 0; + let mut partitions_to_spill = Vec::new(); + for partition_id in (next_to_restore_partition_id..self.num_partitions).rev() { + if !self.partition_buffer.is_partition_empty(partition_id) { + let partition_memory_size = + self.partition_buffer.partition_memory_size(partition_id); + if let Some(data_blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &option)? + { + let data_block = DataBlock::concat(&data_blocks)?; + partitions_to_spill.push((partition_id, data_block)); + accumulated_bytes += partition_memory_size; + } + if accumulated_bytes >= spill_unit_size { + break; + } + } + } + + if accumulated_bytes > 0 { + let spilled_data = self + .spiller + .spill_with_merged_partitions(partitions_to_spill) + .await?; + if let SpilledData::MergedPartition { + location, + partitions, + } = spilled_data + { + let index = self.spilled_merged_partitions.len(); + for partition in partitions.iter() { + self.spilled_small_partitions[partition.0].push(index); + } + self.spilled_merged_partitions.push(( + SpilledData::MergedPartition { + location, + partitions, + }, + false, + false, + )); + return Ok(()); + } + } + + self.can_spill = false; + + Ok(()) + } + + // Restore data blocks from buffer and spilled files. + pub async fn restore(&mut self) -> Result> { + while self.next_to_restore_partition_id + 1 < self.num_partitions as isize { + self.next_to_restore_partition_id += 1; + let partition_id = self.next_to_restore_partition_id as usize; + // Restore large partitions from spilled files. + let mut result = self.spiller.read_spilled_partition(&partition_id).await?; + + // Restore small merged partitions from spilled files. + let spilled_small_partitions = + std::mem::take(&mut self.spilled_small_partitions[partition_id]); + for index in spilled_small_partitions { + let out_of_memory_limit = self.out_of_memory_limit(); + let (merged_partitions, restored, partial_restored) = + &mut self.spilled_merged_partitions[index]; + if *restored { + continue; + } + if let SpilledData::MergedPartition { + location, + partitions, + } = merged_partitions + { + if out_of_memory_limit || *partial_restored { + if let Some(pos) = partitions.iter().position(|p| p.0 == partition_id) { + let data_range = &partitions[pos].1; + let columns_layout = &partitions[pos].2; + let data_block = self + .spiller + .read_range(location, data_range.clone(), columns_layout) + .await?; + self.restored_partition_buffer + .add_data_block(partition_id, data_block); + partitions.remove(pos); + *partial_restored = true; + } + } else { + let partitioned_data = self + .spiller + .read_merged_partitions(merged_partitions) + .await?; + for (partition_id, data_block) in partitioned_data.into_iter() { + self.restored_partition_buffer + .add_data_block(partition_id, data_block); + } + *restored = true; + } + } + } + + if !self.partition_buffer.is_partition_empty(partition_id) { + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + if let Some(data_blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &option)? + { + result.extend(self.concat_data_blocks(data_blocks)?); + } + } + + if !self + .restored_partition_buffer + .is_partition_empty(partition_id) + { + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + if let Some(data_blocks) = self + .restored_partition_buffer + .fetch_data_blocks(partition_id, &option)? 
+ { + result.extend(self.concat_data_blocks(data_blocks)?); + } + } + + if !result.is_empty() { + return Ok(result); + } + } + Ok(vec![]) + } + + fn concat_data_blocks(&self, data_blocks: Vec) -> Result> { + let mut num_rows = 0; + let mut result = Vec::new(); + let mut current_blocks = Vec::new(); + + for data_block in data_blocks.into_iter() { + num_rows += data_block.num_rows(); + current_blocks.push(data_block); + if num_rows >= self.sort_block_size { + result.push(DataBlock::concat(¤t_blocks)?); + num_rows = 0; + current_blocks.clear(); + } + } + + if !current_blocks.is_empty() { + result.push(DataBlock::concat(¤t_blocks)?); + } + + Ok(result) + } +} + +#[derive(Clone, Debug, Default)] +pub struct WindowSpillSettings { + enable_spill: bool, + global_memory_threshold: usize, + processor_memory_threshold: usize, + spill_unit_size: usize, +} + +impl WindowSpillSettings { + pub fn new(settings: Arc, num_threads: usize) -> Result { + let global_memory_ratio = + std::cmp::min(settings.get_window_partition_spilling_memory_ratio()?, 100) as f64 + / 100_f64; + + if global_memory_ratio == 0.0 { + return Ok(WindowSpillSettings { + enable_spill: false, + global_memory_threshold: usize::MAX, + processor_memory_threshold: usize::MAX, + spill_unit_size: 0, + }); + } + + let global_memory_threshold = match settings.get_max_memory_usage()? { + 0 => usize::MAX, + max_memory_usage => match global_memory_ratio { + mr if mr == 0_f64 => usize::MAX, + mr => (max_memory_usage as f64 * mr) as usize, + }, + }; + + let processor_memory_threshold = + settings.get_window_partition_spilling_bytes_threshold_per_proc()?; + let processor_memory_threshold = match processor_memory_threshold { + 0 => global_memory_threshold / num_threads, + bytes => bytes, + }; + + let spill_unit_size = settings.get_window_spill_unit_size_mb()? * 1024 * 1024; + + Ok(WindowSpillSettings { + enable_spill: true, + global_memory_threshold, + processor_memory_threshold, + spill_unit_size, + }) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_meta.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_meta.rs new file mode 100644 index 000000000000..8ff262a41924 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_meta.rs @@ -0,0 +1,69 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Debug; +use std::fmt::Formatter; + +use databend_common_expression::BlockMetaInfo; +use databend_common_expression::BlockMetaInfoPtr; +use databend_common_expression::DataBlock; + +pub struct WindowPartitionMeta { + // Each element in `partitioned_data` is (partition_id, data_block). 
+ pub partitioned_data: Vec<(usize, DataBlock)>, +} + +impl WindowPartitionMeta { + pub fn create(partitioned_data: Vec<(usize, DataBlock)>) -> BlockMetaInfoPtr { + Box::new(WindowPartitionMeta { partitioned_data }) + } +} + +impl serde::Serialize for WindowPartitionMeta { + fn serialize(&self, _: S) -> Result + where S: serde::Serializer { + unreachable!("WindowPartitionMeta does not support exchanging between multiple nodes") + } +} + +impl<'de> serde::Deserialize<'de> for WindowPartitionMeta { + fn deserialize(_: D) -> Result + where D: serde::Deserializer<'de> { + unreachable!("WindowPartitionMeta does not support exchanging between multiple nodes") + } +} + +impl Debug for WindowPartitionMeta { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("WindowPartitionMeta").finish() + } +} + +impl BlockMetaInfo for WindowPartitionMeta { + fn typetag_deserialize(&self) { + unimplemented!("WindowPartitionMeta does not support exchanging between multiple nodes") + } + + fn typetag_name(&self) -> &'static str { + unimplemented!("WindowPartitionMeta does not support exchanging between multiple nodes") + } + + fn equals(&self, _: &Box) -> bool { + unimplemented!("Unimplemented equals for WindowPartitionMeta") + } + + fn clone_self(&self) -> Box { + unimplemented!("Unimplemented clone for WindowPartitionMeta") + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_bucket.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_bucket.rs deleted file mode 100644 index f7fb9bfff718..000000000000 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_bucket.rs +++ /dev/null @@ -1,288 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
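`WindowPartitionBuffer::spill` above prefers to spill a single partition that already exceeds `spill_unit_size` (scanning from the last partition back towards the next one to restore) and only otherwise merges small partitions until the unit size is reached. The following is a reduced, pure-function mirror of that decision order over per-partition byte counts; the real method also drains the chosen partitions through the `PartitionBuffer` and records where each merged range lands.

```rust
// Decision logic mirroring WindowPartitionBuffer::spill, reduced to a pure
// function over per-partition buffered sizes (in bytes). Partitions are scanned
// from the last one back towards the next partition to restore, so the data
// that will be needed last is spilled first.

#[derive(Debug, PartialEq)]
enum SpillPlan {
    /// One partition alone exceeds the spill unit size; spill it on its own.
    Single(usize),
    /// No partition is large enough; merge several small ones into one spill file.
    Merged(Vec<usize>),
    /// Nothing buffered, nothing to spill.
    Nothing,
}

fn plan_spill(sizes: &[usize], next_to_restore: usize, spill_unit_size: usize) -> SpillPlan {
    // 1. Prefer a single partition that is already larger than the unit size.
    for partition_id in (next_to_restore..sizes.len()).rev() {
        if sizes[partition_id] > spill_unit_size {
            return SpillPlan::Single(partition_id);
        }
    }

    // 2. Otherwise accumulate small partitions until the unit size is reached.
    let mut accumulated = 0;
    let mut to_merge = Vec::new();
    for partition_id in (next_to_restore..sizes.len()).rev() {
        if sizes[partition_id] > 0 {
            accumulated += sizes[partition_id];
            to_merge.push(partition_id);
            if accumulated >= spill_unit_size {
                break;
            }
        }
    }

    if accumulated > 0 {
        SpillPlan::Merged(to_merge)
    } else {
        SpillPlan::Nothing
    }
}

fn main() {
    let unit = 100;
    // Partition 3 is big enough to be spilled alone.
    assert_eq!(plan_spill(&[10, 20, 30, 150], 0, unit), SpillPlan::Single(3));
    // Only small partitions: merge from the back until ~100 bytes are collected.
    assert_eq!(
        plan_spill(&[10, 40, 30, 50], 0, unit),
        SpillPlan::Merged(vec![3, 2, 1])
    );
}
```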
- -use std::any::Any; -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; - -use super::WindowPartitionMeta; - -static SINGLE_LEVEL_BUCKET_NUM: isize = -1; - -struct InputPortState { - port: Arc, - bucket: isize, -} - -pub struct TransformWindowPartitionBucket { - inputs: Vec, - output: Arc, - working_bucket: isize, - pushing_bucket: isize, - initialized_all_inputs: bool, - buckets_blocks: BTreeMap>, -} - -impl TransformWindowPartitionBucket { - pub fn create(input_nums: usize) -> Result { - let mut inputs = Vec::with_capacity(input_nums); - - for _ in 0..input_nums { - inputs.push(InputPortState { - bucket: -1, - port: InputPort::create(), - }) - } - - Ok(TransformWindowPartitionBucket { - inputs, - output: OutputPort::create(), - working_bucket: 0, - pushing_bucket: 0, - initialized_all_inputs: false, - buckets_blocks: BTreeMap::new(), - }) - } - - pub fn get_inputs(&self) -> Vec> { - let mut inputs = Vec::with_capacity(self.inputs.len()); - - for input_state in &self.inputs { - inputs.push(input_state.port.clone()); - } - - inputs - } - - pub fn get_output(&self) -> Arc { - self.output.clone() - } - - fn initialize_all_inputs(&mut self) -> Result { - self.initialized_all_inputs = true; - - for index in 0..self.inputs.len() { - if self.inputs[index].port.is_finished() { - continue; - } - - if self.inputs[index].bucket > SINGLE_LEVEL_BUCKET_NUM { - continue; - } - - if !self.inputs[index].port.has_data() { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - continue; - } - - let data_block = self.inputs[index].port.pull_data().unwrap()?; - self.inputs[index].bucket = self.add_bucket(data_block)?; - - if self.inputs[index].bucket <= SINGLE_LEVEL_BUCKET_NUM { - self.inputs[index].port.set_need_data(); - self.initialized_all_inputs = false; - } - } - - Ok(self.initialized_all_inputs) - } - - fn add_bucket(&mut self, mut data_block: DataBlock) -> Result { - if let Some(block_meta) = data_block.get_meta() { - if let Some(block_meta) = WindowPartitionMeta::downcast_ref_from(block_meta) { - let (bucket, res) = match block_meta { - WindowPartitionMeta::Spilling(_) => unreachable!(), - WindowPartitionMeta::BucketSpilled(_) => unreachable!(), - WindowPartitionMeta::Partitioned { .. 
} => unreachable!(), - WindowPartitionMeta::Payload(p) => (p.bucket, p.bucket), - WindowPartitionMeta::Spilled(_) => { - let meta = data_block.take_meta().unwrap(); - - if let Some(WindowPartitionMeta::Spilled(buckets_payload)) = - WindowPartitionMeta::downcast_from(meta) - { - for bucket_payload in buckets_payload { - match self.buckets_blocks.entry(bucket_payload.bucket) { - Entry::Vacant(v) => { - v.insert(vec![DataBlock::empty_with_meta( - WindowPartitionMeta::create_bucket_spilled( - bucket_payload, - ), - )]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(DataBlock::empty_with_meta( - WindowPartitionMeta::create_bucket_spilled( - bucket_payload, - ), - )); - } - }; - } - - return Ok(SINGLE_LEVEL_BUCKET_NUM); - } - - unreachable!() - } - }; - - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![data_block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(data_block); - } - }; - - return Ok(res); - } - } - - unreachable!() - } - - fn try_push_data_block(&mut self) -> bool { - while self.pushing_bucket < self.working_bucket { - if let Some(bucket_blocks) = self.buckets_blocks.remove(&self.pushing_bucket) { - let data_block = Self::convert_blocks(self.pushing_bucket, bucket_blocks); - self.output.push_data(Ok(data_block)); - self.pushing_bucket += 1; - return true; - } - - self.pushing_bucket += 1; - } - - false - } - - fn convert_blocks(bucket: isize, data_blocks: Vec) -> DataBlock { - let mut data = Vec::with_capacity(data_blocks.len()); - for mut data_block in data_blocks.into_iter() { - if let Some(block_meta) = data_block.take_meta() { - if let Some(block_meta) = WindowPartitionMeta::downcast_from(block_meta) { - data.push(block_meta); - } - } - } - - DataBlock::empty_with_meta(WindowPartitionMeta::create_partitioned(bucket, data)) - } -} - -#[async_trait::async_trait] -impl Processor for TransformWindowPartitionBucket { - fn name(&self) -> String { - String::from("TransformWindowPartitionBucket") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - for input_state in &self.inputs { - input_state.port.finish(); - } - - self.buckets_blocks.clear(); - return Ok(Event::Finished); - } - - if !self.initialized_all_inputs && !self.initialize_all_inputs()? 
{ - return Ok(Event::NeedData); - } - - if !self.output.can_push() { - for input_state in &self.inputs { - input_state.port.set_not_need_data(); - } - - return Ok(Event::NeedConsume); - } - - let pushed_data_block = self.try_push_data_block(); - - loop { - let mut all_inputs_is_finished = true; - let mut all_ports_prepared_data = true; - - for index in 0..self.inputs.len() { - if self.inputs[index].port.is_finished() { - continue; - } - - all_inputs_is_finished = false; - if self.inputs[index].bucket > self.working_bucket { - continue; - } - - if !self.inputs[index].port.has_data() { - all_ports_prepared_data = false; - self.inputs[index].port.set_need_data(); - continue; - } - - let data_block = self.inputs[index].port.pull_data().unwrap()?; - self.inputs[index].bucket = self.add_bucket(data_block)?; - - if self.inputs[index].bucket <= self.working_bucket { - all_ports_prepared_data = false; - self.inputs[index].port.set_need_data(); - } - } - - if all_inputs_is_finished { - break; - } - - if !all_ports_prepared_data { - return Ok(Event::NeedData); - } - - self.working_bucket += 1; - } - - if pushed_data_block || self.try_push_data_block() { - return Ok(Event::NeedConsume); - } - - if let Some((bucket, bucket_blocks)) = self.buckets_blocks.pop_first() { - let data_block = Self::convert_blocks(bucket, bucket_blocks); - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - self.output.finish(); - - Ok(Event::Finished) - } - - fn process(&mut self) -> Result<()> { - Ok(()) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_scatter.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_scatter.rs deleted file mode 100644 index fcb5ff27c9d9..000000000000 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_scatter.rs +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
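For comparison with the new collect/buffer pair, the deleted `TransformWindowPartitionBucket` above grouped incoming blocks per bucket in a `BTreeMap` and only emitted a bucket, in ascending order, once every input had advanced past it. A reduced model of that accumulation, with strings standing in for data blocks:

```rust
use std::collections::BTreeMap;

// Reduced model of the removed TransformWindowPartitionBucket: blocks arrive
// tagged with a bucket id, are grouped in a BTreeMap, and a bucket may only be
// emitted once every input has progressed past it (so all of its data is known
// to have arrived).

struct BucketAccumulator {
    buckets: BTreeMap<isize, Vec<String>>,
}

impl BucketAccumulator {
    fn new() -> Self {
        Self { buckets: BTreeMap::new() }
    }

    fn add(&mut self, bucket: isize, block: String) {
        self.buckets.entry(bucket).or_default().push(block);
    }

    /// Emit all buckets strictly below `working_bucket`, smallest first.
    fn drain_ready(&mut self, working_bucket: isize) -> Vec<(isize, Vec<String>)> {
        let mut ready = Vec::new();
        while let Some((&bucket, _)) = self.buckets.first_key_value() {
            if bucket >= working_bucket {
                break;
            }
            let (bucket, blocks) = self.buckets.pop_first().expect("checked non-empty");
            ready.push((bucket, blocks));
        }
        ready
    }
}

fn main() {
    let mut acc = BucketAccumulator::new();
    acc.add(2, "b2-from-input-0".to_string());
    acc.add(0, "b0-from-input-1".to_string());
    acc.add(0, "b0-from-input-0".to_string());

    // Inputs have only advanced to bucket 2, so bucket 2 itself is not ready yet.
    let ready = acc.drain_ready(2);
    assert_eq!(ready.len(), 1);
    assert_eq!(ready[0].0, 0);
    assert_eq!(ready[0].1.len(), 2);
}
```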
- -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::sync::Arc; - -use databend_common_base::runtime::GLOBAL_MEM_STAT; -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::group_hash_columns_slice; -use databend_common_expression::ColumnBuilder; -use databend_common_expression::DataBlock; -use databend_common_expression::Value; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::processors::AccumulatingTransform; -use databend_common_pipeline_transforms::processors::AccumulatingTransformer; - -use super::WindowPartitionMeta; -use crate::sessions::QueryContext; - -pub static PARTITION_COUNT: usize = 256; - -#[derive(Default)] -pub struct PartitionHashTable { - pub buckets_blocks: BTreeMap>, - hash_keys: Vec, - allocated_bytes: usize, -} - -impl PartitionHashTable { - pub fn new(hash_keys: Vec) -> Self { - Self { - buckets_blocks: BTreeMap::new(), - hash_keys, - allocated_bytes: 0, - } - } - - pub fn add_block(&mut self, block: DataBlock) -> Result<()> { - let num_rows = block.num_rows(); - - let hash_cols = self - .hash_keys - .iter() - .map(|&offset| { - let entry = block.get_by_offset(offset); - match &entry.value { - Value::Scalar(s) => { - ColumnBuilder::repeat(&s.as_ref(), num_rows, &entry.data_type).build() - } - Value::Column(c) => c.clone(), - } - }) - .collect::>(); - - let mut hashes = vec![0u64; num_rows]; - group_hash_columns_slice(&hash_cols, &mut hashes); - - let indices = hashes - .iter() - .map(|&hash| (hash % PARTITION_COUNT as u64) as u8) - .collect::>(); - let scatter_blocks = DataBlock::scatter(&block, &indices, PARTITION_COUNT)?; - debug_assert_eq!(scatter_blocks.len(), PARTITION_COUNT); - - for (bucket, block) in scatter_blocks.into_iter().enumerate() { - if !block.is_empty() { - self.allocated_bytes += block.memory_size(); - match self.buckets_blocks.entry(bucket) { - Entry::Vacant(v) => { - v.insert(vec![block]); - } - Entry::Occupied(mut v) => { - v.get_mut().push(block); - } - }; - } - } - - Ok(()) - } - - #[inline] - pub fn allocated_bytes(&self) -> usize { - self.allocated_bytes - } -} - -pub fn convert_to_partitions( - mut buckets_blocks: BTreeMap>, -) -> Result> { - let mut partitions = Vec::with_capacity(PARTITION_COUNT); - while let Some((bucket, blocks)) = buckets_blocks.pop_first() { - let payload = DataBlock::concat(&blocks)?; - partitions.push((bucket as isize, payload)); - } - - Ok(partitions) -} - -struct WindowPartitionSettings { - max_memory_usage: usize, - spilling_bytes_threshold_per_proc: usize, -} - -impl TryFrom> for WindowPartitionSettings { - type Error = ErrorCode; - - fn try_from(ctx: Arc) -> std::result::Result { - let settings = ctx.get_settings(); - let max_threads = settings.get_max_threads()? as usize; - let mut memory_ratio = - settings.get_window_partition_spilling_memory_ratio()? as f64 / 100_f64; - - if memory_ratio > 1_f64 { - memory_ratio = 1_f64; - } - - let max_memory_usage = match settings.get_max_memory_usage()? 
{ - 0 => usize::MAX, - max_memory_usage => match memory_ratio { - x if x == 0_f64 => usize::MAX, - memory_ratio => (max_memory_usage as f64 * memory_ratio) as usize, - }, - }; - - Ok(WindowPartitionSettings { - max_memory_usage, - spilling_bytes_threshold_per_proc: match settings - .get_window_partition_spilling_bytes_threshold_per_proc()? - { - 0 => max_memory_usage / max_threads, - spilling_bytes_threshold_per_proc => spilling_bytes_threshold_per_proc, - }, - }) - } -} - -pub struct TransformWindowPartitionScatter { - hash_table: PartitionHashTable, - settings: WindowPartitionSettings, -} - -impl TransformWindowPartitionScatter { - pub fn try_create( - ctx: Arc, - input: Arc, - output: Arc, - hash_keys: Vec, - ) -> Result> { - let hash_table = PartitionHashTable::new(hash_keys.clone()); - - Ok(AccumulatingTransformer::create( - input, - output, - TransformWindowPartitionScatter { - hash_table, - settings: WindowPartitionSettings::try_from(ctx)?, - }, - )) - } -} - -impl AccumulatingTransform for TransformWindowPartitionScatter { - const NAME: &'static str = "TransformWindowPartitionScatter"; - - fn transform(&mut self, block: DataBlock) -> Result> { - self.hash_table.add_block(block)?; - - if self.hash_table.allocated_bytes() > self.settings.spilling_bytes_threshold_per_proc - || GLOBAL_MEM_STAT.get_memory_usage() as usize >= self.settings.max_memory_usage - { - let hash_table = std::mem::take(&mut self.hash_table); - let blocks = vec![DataBlock::empty_with_meta( - WindowPartitionMeta::create_spilling(hash_table.buckets_blocks), - )]; - - self.hash_table = PartitionHashTable::new(hash_table.hash_keys); - return Ok(blocks); - } - - Ok(vec![]) - } - - fn on_finish(&mut self, _output: bool) -> Result> { - let mut blocks = Vec::with_capacity(PARTITION_COUNT); - let hash_table = std::mem::take(&mut self.hash_table); - - let partitions = convert_to_partitions(hash_table.buckets_blocks)?; - for (bucket, block) in partitions.into_iter() { - if !block.is_empty() { - blocks.push(DataBlock::empty_with_meta( - WindowPartitionMeta::create_payload(bucket, block), - )); - } - } - - Ok(blocks) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_sort.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_sort.rs deleted file mode 100644 index 6f27c2da3545..000000000000 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_sort.rs +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
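The removed `WindowPartitionSettings` above and the new `WindowSpillSettings` earlier in the patch derive their limits with the same arithmetic: a percentage ratio clamped to 100, a global threshold of `max_memory_usage * ratio` (unlimited when either input is zero), and a per-processor fallback of an equal share of the global limit. A pure-function sketch of that computation, with hypothetical parameter names:

```rust
// Sketch of the threshold arithmetic shared by the old WindowPartitionSettings
// and the new WindowSpillSettings. A ratio of 0 effectively disables spilling,
// max_memory_usage = 0 means "unlimited", and an unset per-processor threshold
// falls back to an equal share of the global limit.

fn spill_thresholds(
    memory_ratio_percent: u64,
    max_memory_usage: u64,
    per_proc_setting: usize,
    num_threads: usize,
) -> (usize, usize) {
    let ratio = memory_ratio_percent.min(100) as f64 / 100.0;

    let global = if max_memory_usage == 0 || ratio == 0.0 {
        usize::MAX
    } else {
        (max_memory_usage as f64 * ratio) as usize
    };

    let per_processor = if per_proc_setting == 0 {
        global / num_threads
    } else {
        per_proc_setting
    };

    (global, per_processor)
}

fn main() {
    // 16 GiB memory limit, spill once the window operator holds more than 50% of
    // it, split evenly across 8 processors when no explicit per-processor value
    // is configured.
    let (global, per_proc) = spill_thresholds(50, 16u64 << 30, 0, 8);
    assert_eq!(global, (8u64 << 30) as usize);
    assert_eq!(per_proc, global / 8);
}
```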
- -use std::sync::Arc; - -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::SortColumnDescription; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::processors::sort_merge; -use databend_common_pipeline_transforms::processors::BlockMetaTransform; -use databend_common_pipeline_transforms::processors::BlockMetaTransformer; - -use crate::pipelines::processors::transforms::WindowPartitionMeta; - -pub struct TransformWindowPartitionSort { - sort_desc: Vec, - schema: DataSchemaRef, - block_size: usize, - sort_spilling_batch_bytes: usize, - enable_loser_tree: bool, - have_order_col: bool, -} - -impl TransformWindowPartitionSort { - pub fn try_create( - input: Arc, - output: Arc, - sort_desc: Vec, - schema: DataSchemaRef, - block_size: usize, - sort_spilling_batch_bytes: usize, - enable_loser_tree: bool, - have_order_col: bool, - ) -> Result> { - Ok(BlockMetaTransformer::create( - input, - output, - TransformWindowPartitionSort { - sort_desc, - schema, - block_size, - sort_spilling_batch_bytes, - enable_loser_tree, - have_order_col, - }, - )) - } -} - -impl BlockMetaTransform for TransformWindowPartitionSort { - const NAME: &'static str = "TransformWindowPartitionSort"; - - fn transform(&mut self, meta: WindowPartitionMeta) -> Result> { - if let WindowPartitionMeta::Partitioned { bucket, data } = meta { - let mut sort_blocks = Vec::with_capacity(data.len()); - for bucket_data in data { - match bucket_data { - WindowPartitionMeta::Spilled(_) => unreachable!(), - WindowPartitionMeta::BucketSpilled(_) => unreachable!(), - WindowPartitionMeta::Partitioned { .. } => unreachable!(), - WindowPartitionMeta::Spilling(_) => unreachable!(), - WindowPartitionMeta::Payload(p) => { - debug_assert!(bucket == p.bucket); - let sort_block = DataBlock::sort(&p.data, &self.sort_desc, None)?; - sort_blocks.push(sort_block); - } - } - } - - sort_blocks = sort_merge( - self.schema.clone(), - self.block_size, - self.sort_desc.clone(), - sort_blocks, - self.sort_spilling_batch_bytes, - self.enable_loser_tree, - self.have_order_col, - )?; - - return Ok(sort_blocks); - } - - Err(ErrorCode::Internal( - "TransformWindowPartitionSort only recv WindowPartitionMeta::Partitioned", - )) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs deleted file mode 100644 index e16359036432..000000000000 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::collections::VecDeque; -use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; - -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::arrow::deserialize_column; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::BlockMetaInfoPtr; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; -use itertools::Itertools; -use log::info; -use opendal::Operator; -use tokio::sync::Semaphore; - -use super::WindowPartitionMeta; -use super::WindowPayload; -use crate::pipelines::processors::transforms::window::partition_by::BucketSpilledWindowPayload; - -type DeserializingMeta = (WindowPartitionMeta, VecDeque>); - -pub struct TransformWindowPartitionSpillReader { - input: Arc, - output: Arc, - - operator: Operator, - semaphore: Arc, - deserialized_meta: Option, - reading_meta: Option, - deserializing_meta: Option, -} - -#[async_trait::async_trait] -impl Processor for TransformWindowPartitionSpillReader { - fn name(&self) -> String { - String::from("TransformWindowPartitionSpillReader") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if let Some(deserialized_meta) = self.deserialized_meta.take() { - self.output - .push_data(Ok(DataBlock::empty_with_meta(deserialized_meta))); - return Ok(Event::NeedConsume); - } - - if self.deserializing_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Sync); - } - - if self.reading_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Async); - } - - if self.input.has_data() { - let mut data_block = self.input.pull_data().unwrap()?; - - if let Some(WindowPartitionMeta::Partitioned { data, .. 
}) = data_block - .get_meta() - .and_then(WindowPartitionMeta::downcast_ref_from) - { - if data - .iter() - .any(|meta| matches!(meta, WindowPartitionMeta::BucketSpilled(_))) - { - self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.reading_meta = WindowPartitionMeta::downcast_from(block_meta); - return Ok(Event::Async); - } - } - - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.input.is_finished() { - self.output.finish(); - return Ok(Event::Finished); - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - if let Some((meta, mut read_data)) = self.deserializing_meta.take() { - match meta { - WindowPartitionMeta::Spilled(_) => unreachable!(), - WindowPartitionMeta::Spilling(_) => unreachable!(), - WindowPartitionMeta::BucketSpilled(_) => unreachable!(), - WindowPartitionMeta::Payload(_) => unreachable!(), - WindowPartitionMeta::Partitioned { bucket, data } => { - let mut new_data = Vec::with_capacity(data.len()); - - for meta in data { - if matches!(&meta, WindowPartitionMeta::BucketSpilled(_)) { - if let WindowPartitionMeta::BucketSpilled(p) = meta { - let data = read_data.pop_front().unwrap(); - new_data.push(Self::deserialize(p, data)); - } - - continue; - } - - new_data.push(meta); - } - - self.deserialized_meta = - Some(WindowPartitionMeta::create_partitioned(bucket, new_data)); - } - } - } - - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - if let Some(block_meta) = self.reading_meta.take() { - match &block_meta { - WindowPartitionMeta::Spilled(_) => unreachable!(), - WindowPartitionMeta::Spilling(_) => unreachable!(), - WindowPartitionMeta::BucketSpilled(_) => unreachable!(), - WindowPartitionMeta::Payload(_) => unreachable!(), - WindowPartitionMeta::Partitioned { data, .. } => { - let mut total_elapsed = Duration::default(); - let log_interval = 100; - let mut processed_count = 0; - - let mut read_data = Vec::with_capacity(data.len()); - for meta in data { - if let WindowPartitionMeta::BucketSpilled(p) = meta { - let location = p.location.clone(); - let operator = self.operator.clone(); - let data_range = p.data_range.clone(); - let semaphore = self.semaphore.clone(); - read_data.push(databend_common_base::runtime::spawn(async move { - let _guard = semaphore.acquire().await; - let instant = Instant::now(); - let data = operator - .read_with(&location) - .range(data_range) - .await? 
- .to_vec(); - - // perf - { - Profile::record_usize_profile( - ProfileStatisticsName::SpillReadCount, - 1, - ); - Profile::record_usize_profile( - ProfileStatisticsName::SpillReadBytes, - data.len(), - ); - Profile::record_usize_profile( - ProfileStatisticsName::SpillReadTime, - instant.elapsed().as_millis() as usize, - ); - } - - total_elapsed += instant.elapsed(); - processed_count += 1; - - // log the progress - if processed_count % log_interval == 0 { - info!( - "Read window partition {}/{} spilled buckets, elapsed: {:?}", - processed_count, - data.len(), - total_elapsed - ); - } - - Ok(data) - })); - } - } - - match futures::future::try_join_all(read_data).await { - Err(_) => { - return Err(ErrorCode::TokioError("Cannot join tokio job")); - } - Ok(read_data) => { - let read_data: std::result::Result>, opendal::Error> = - read_data.into_iter().try_collect(); - - self.deserializing_meta = Some((block_meta, read_data?)); - } - }; - - if processed_count > 0 { - info!( - "Read {} window partition spills successfully, total elapsed: {:?}", - processed_count, total_elapsed - ); - } - } - } - } - - Ok(()) - } -} - -impl TransformWindowPartitionSpillReader { - pub fn create( - input: Arc, - output: Arc, - operator: Operator, - semaphore: Arc, - ) -> Result { - Ok(ProcessorPtr::create(Box::new( - TransformWindowPartitionSpillReader { - input, - output, - operator, - semaphore, - deserialized_meta: None, - reading_meta: None, - deserializing_meta: None, - }, - ))) - } - - fn deserialize(payload: BucketSpilledWindowPayload, data: Vec) -> WindowPartitionMeta { - let mut begin = 0; - let mut columns = Vec::with_capacity(payload.columns_layout.len()); - - for column_layout in payload.columns_layout { - columns.push(deserialize_column(&data[begin..begin + column_layout as usize]).unwrap()); - begin += column_layout as usize; - } - - WindowPartitionMeta::Payload(WindowPayload { - bucket: payload.bucket, - data: DataBlock::new_from_columns(columns), - }) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs deleted file mode 100644 index 2eb678da77f8..000000000000 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ /dev/null @@ -1,272 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
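The removed reader above and the writer that follows share one on-disk layout, and the new `Spiller::read_range` path keeps the same idea: per-column byte blobs concatenated into one region, with a `columns_layout` vector of lengths used to slice them apart again without any framing inside the data itself. A standalone sketch of that encoding over plain byte vectors (the real code serializes Arrow columns):

```rust
// Sketch of the spill layout used on both the write and the read side: each
// column is serialized to its own byte blob, the blobs are concatenated, and a
// `columns_layout` vector of lengths lets the reader slice the region back into
// columns.

fn encode(columns: &[Vec<u8>]) -> (Vec<u8>, Vec<u64>) {
    let mut data = Vec::new();
    let mut columns_layout = Vec::with_capacity(columns.len());
    for column in columns {
        columns_layout.push(column.len() as u64);
        data.extend_from_slice(column);
    }
    (data, columns_layout)
}

fn decode(data: &[u8], columns_layout: &[u64]) -> Vec<Vec<u8>> {
    let mut begin = 0usize;
    let mut columns = Vec::with_capacity(columns_layout.len());
    for &len in columns_layout {
        let end = begin + len as usize;
        columns.push(data[begin..end].to_vec());
        begin = end;
    }
    columns
}

fn main() {
    let columns = vec![vec![1, 2, 3], vec![9, 9], vec![7]];
    let (data, layout) = encode(&columns);
    assert_eq!(data.len(), 6);
    assert_eq!(decode(&data, &layout), columns);
}
```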
- -use std::any::Any; -use std::sync::Arc; -use std::time::Instant; - -use databend_common_base::base::GlobalUniqName; -use databend_common_base::base::ProgressValues; -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::arrow::serialize_column; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use futures_util::future::BoxFuture; -use log::info; -use opendal::Operator; - -use super::convert_to_partitions; -use super::BucketSpilledWindowPayload; -use super::WindowPartitionMeta; -use crate::pipelines::processors::transforms::window::partition_by::SpillingWindowPayloads; -use crate::pipelines::processors::transforms::window::partition_by::PARTITION_COUNT; -use crate::sessions::QueryContext; - -pub struct TransformWindowPartitionSpillWriter { - ctx: Arc, - input: Arc, - output: Arc, - - operator: Operator, - location_prefix: String, - spilled_block: Option, - spilling_meta: Option, - spilling_future: Option>>, -} - -impl TransformWindowPartitionSpillWriter { - pub fn create( - ctx: Arc, - input: Arc, - output: Arc, - operator: Operator, - location_prefix: String, - ) -> Box { - Box::new(TransformWindowPartitionSpillWriter { - ctx, - input, - output, - operator, - location_prefix, - spilled_block: None, - spilling_meta: None, - spilling_future: None, - }) - } -} - -#[async_trait::async_trait] -impl Processor for TransformWindowPartitionSpillWriter { - fn name(&self) -> String { - String::from("TransformWindowPartitionSpillWriter") - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if self.spilling_future.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Async); - } - - if let Some(spilled_block) = self.spilled_block.take() { - if !spilled_block.is_empty() || spilled_block.get_meta().is_some() { - self.output.push_data(Ok(spilled_block)); - return Ok(Event::NeedConsume); - } - } - - if self.spilling_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Sync); - } - - if self.input.has_data() { - let mut data_block = self.input.pull_data().unwrap()?; - - if let Some(block_meta) = data_block - .get_meta() - .and_then(WindowPartitionMeta::downcast_ref_from) - { - if matches!(block_meta, WindowPartitionMeta::Spilling(_)) { - self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.spilling_meta = WindowPartitionMeta::downcast_from(block_meta); - return Ok(Event::Sync); - } - } - - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.input.is_finished() { - self.output.finish(); - return Ok(Event::Finished); - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - if let Some(spilling_meta) = self.spilling_meta.take() { - match spilling_meta { - WindowPartitionMeta::Spilling(payload) => { - 
self.spilling_future = Some(spilling_window_payload( - self.ctx.clone(), - self.operator.clone(), - &self.location_prefix, - payload, - )?); - - return Ok(()); - } - - _ => { - return Err(ErrorCode::Internal( - "TransformWindowPartitionSpillWriter only recv WindowPartitionMeta", - )); - } - } - } - - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - if let Some(spilling_future) = self.spilling_future.take() { - self.spilled_block = Some(spilling_future.await?); - } - - Ok(()) - } -} - -pub fn spilling_window_payload( - ctx: Arc, - operator: Operator, - location_prefix: &str, - payload: SpillingWindowPayloads, -) -> Result>> { - let partitions = convert_to_partitions(payload.data)?; - - let unique_name = GlobalUniqName::unique(); - let location = format!("{}/{}", location_prefix, unique_name); - - let mut write_size = 0; - let mut write_data = Vec::with_capacity(PARTITION_COUNT); - let mut spilled_buckets_payloads = Vec::with_capacity(PARTITION_COUNT); - let mut rows = 0; - - for (bucket, block) in partitions.into_iter() { - if block.is_empty() { - continue; - } - - rows += block.num_rows(); - - let begin = write_size; - let columns = block.columns().to_vec(); - let mut columns_data = Vec::with_capacity(columns.len()); - let mut columns_layout = Vec::with_capacity(columns.len()); - for column in columns.into_iter() { - let column = column - .value - .convert_to_full_column(&column.data_type, block.num_rows()); - let column_data = serialize_column(&column); - write_size += column_data.len() as u64; - columns_layout.push(column_data.len() as u64); - columns_data.push(column_data); - } - - write_data.push(columns_data); - spilled_buckets_payloads.push(BucketSpilledWindowPayload { - bucket, - location: location.clone(), - data_range: begin..write_size, - columns_layout, - }); - } - - Ok(Box::pin(async move { - let instant = Instant::now(); - - let mut write_bytes = 0; - if !write_data.is_empty() { - let mut writer = operator - .writer_with(&location) - .chunk(8 * 1024 * 1024) - .await?; - for write_bucket_data in write_data.into_iter() { - for data in write_bucket_data.into_iter() { - write_bytes += data.len(); - writer.write(data).await?; - } - } - - writer.close().await?; - } - - // perf - { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); - Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, - instant.elapsed().as_millis() as usize, - ); - } - - { - let progress_val = ProgressValues { - rows, - bytes: write_bytes, - }; - ctx.get_window_partition_spill_progress() - .incr(&progress_val); - } - - info!( - "Write window partition spill {} successfully, elapsed: {:?}", - location, - instant.elapsed() - ); - - Ok(DataBlock::empty_with_meta( - WindowPartitionMeta::create_spilled(spilled_buckets_payloads), - )) - })) -} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs deleted file mode 100644 index 8dfe0d9e47a2..000000000000 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::fmt::Formatter; -use std::ops::Range; - -use databend_common_expression::BlockMetaInfo; -use databend_common_expression::BlockMetaInfoPtr; -use databend_common_expression::DataBlock; - -pub struct WindowPayload { - pub bucket: isize, - pub data: DataBlock, -} - -pub struct SpillingWindowPayloads { - pub data: BTreeMap>, -} - -pub struct BucketSpilledWindowPayload { - pub bucket: isize, - pub location: String, - pub data_range: Range, - pub columns_layout: Vec, -} - -pub enum WindowPartitionMeta { - Spilling(SpillingWindowPayloads), - Spilled(Vec), - BucketSpilled(BucketSpilledWindowPayload), - Payload(WindowPayload), - - Partitioned { bucket: isize, data: Vec }, -} - -impl WindowPartitionMeta { - pub fn create_payload(bucket: isize, data: DataBlock) -> BlockMetaInfoPtr { - Box::new(WindowPartitionMeta::Payload(WindowPayload { bucket, data })) - } - - pub fn create_spilling(data: BTreeMap>) -> BlockMetaInfoPtr { - Box::new(WindowPartitionMeta::Spilling(SpillingWindowPayloads { - data, - })) - } - - pub fn create_spilled(buckets_payload: Vec) -> BlockMetaInfoPtr { - Box::new(WindowPartitionMeta::Spilled(buckets_payload)) - } - - pub fn create_bucket_spilled(payload: BucketSpilledWindowPayload) -> BlockMetaInfoPtr { - Box::new(WindowPartitionMeta::BucketSpilled(payload)) - } - - pub fn create_partitioned(bucket: isize, data: Vec) -> BlockMetaInfoPtr { - Box::new(WindowPartitionMeta::Partitioned { bucket, data }) - } -} - -impl serde::Serialize for WindowPartitionMeta { - fn serialize(&self, _: S) -> Result - where S: serde::Serializer { - unreachable!("WindowPartitionMeta does not support exchanging between multiple nodes") - } -} - -impl<'de> serde::Deserialize<'de> for WindowPartitionMeta { - fn deserialize(_: D) -> Result - where D: serde::Deserializer<'de> { - unreachable!("WindowPartitionMeta does not support exchanging between multiple nodes") - } -} - -impl Debug for WindowPartitionMeta { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - match self { - WindowPartitionMeta::Payload(_) => { - f.debug_struct("WindowPartitionMeta::Bucket").finish() - } - WindowPartitionMeta::Spilling(_) => { - f.debug_struct("WindowPartitionMeta::Spilling").finish() - } - WindowPartitionMeta::Spilled(_) => { - f.debug_struct("WindowPartitionMeta::Spilled").finish() - } - WindowPartitionMeta::BucketSpilled(_) => f - .debug_struct("WindowPartitionMeta::BucketSpilled") - .finish(), - WindowPartitionMeta::Partitioned { .. 
} => { - f.debug_struct("WindowPartitionMeta::Partitioned").finish() - } - } - } -} - -impl BlockMetaInfo for WindowPartitionMeta { - fn typetag_deserialize(&self) { - unimplemented!("WindowPartitionMeta does not support exchanging between multiple nodes") - } - - fn typetag_name(&self) -> &'static str { - unimplemented!("WindowPartitionMeta does not support exchanging between multiple nodes") - } - - fn equals(&self, _: &Box) -> bool { - unimplemented!("Unimplemented equals for WindowPartitionMeta") - } - - fn clone_self(&self) -> Box { - unimplemented!("Unimplemented clone for WindowPartitionMeta") - } -} diff --git a/src/query/service/src/spillers/mod.rs b/src/query/service/src/spillers/mod.rs index aa3b57e83ddd..b771bce717f7 100644 --- a/src/query/service/src/spillers/mod.rs +++ b/src/query/service/src/spillers/mod.rs @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod partition_buffer; mod spiller; -mod spiller_buffer; +pub use partition_buffer::PartitionBuffer; +pub use partition_buffer::PartitionBufferFetchOption; +pub use spiller::SpilledData; pub use spiller::Spiller; pub use spiller::SpillerConfig; pub use spiller::SpillerType; -pub use spiller_buffer::SpillBuffer; diff --git a/src/query/service/src/spillers/partition_buffer.rs b/src/query/service/src/spillers/partition_buffer.rs new file mode 100644 index 000000000000..dfbf02692f9e --- /dev/null +++ b/src/query/service/src/spillers/partition_buffer.rs @@ -0,0 +1,128 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; + +pub enum PartitionBufferFetchOption { + /// Read all data from the partition. + ReadPartition, + /// Pick data from the partition when the available data reaches or exceeds a certain threshold. + PickPartitionWithThreshold(usize), + /// Pick one data block from the partition. + PickOneDataBlock, + /// Pick at least `usize` bytes from the partition. + PickPartitionBytes(usize), +} + +// The PartitionBuffer is used to buffer partitioned data blocks in the HashJoin and Window operator. +#[derive(Clone)] +pub struct PartitionBuffer { + memory_size: usize, + partition_data: Vec>, + partition_size: Vec, +} + +impl PartitionBuffer { + pub fn create(num_partitions: usize) -> Self { + PartitionBuffer { + memory_size: 0, + partition_data: vec![Vec::new(); num_partitions], + partition_size: vec![0; num_partitions], + } + } + + // Add a partitiond data block to the PartitionBuffer. + pub fn add_data_block(&mut self, partition_id: usize, data_block: DataBlock) { + let data_size = data_block.memory_size(); + self.memory_size += data_size; + self.partition_size[partition_id] += data_size; + self.partition_data[partition_id].push(data_block); + } + + // Fetch data blocks from the PartitionBuffer with the specified option. 
+    pub fn fetch_data_blocks(
+        &mut self,
+        partition_id: usize,
+        option: &PartitionBufferFetchOption,
+    ) -> Result<Option<Vec<DataBlock>>> {
+        let data_blocks = match option {
+            PartitionBufferFetchOption::ReadPartition => {
+                if !self.partition_data[partition_id].is_empty() {
+                    Some(self.partition_data[partition_id].clone())
+                } else {
+                    None
+                }
+            }
+            PartitionBufferFetchOption::PickPartitionWithThreshold(threshold) => {
+                if self.partition_size[partition_id] >= *threshold {
+                    let data_blocks = std::mem::take(&mut self.partition_data[partition_id]);
+                    self.memory_size -= self.partition_size[partition_id];
+                    self.partition_size[partition_id] = 0;
+                    Some(data_blocks)
+                } else {
+                    None
+                }
+            }
+            PartitionBufferFetchOption::PickOneDataBlock => {
+                if let Some(data_block) = self.partition_data[partition_id].pop() {
+                    let memory_size = data_block.memory_size();
+                    self.memory_size -= memory_size;
+                    self.partition_size[partition_id] -= memory_size;
+                    Some(vec![data_block])
+                } else {
+                    None
+                }
+            }
+            PartitionBufferFetchOption::PickPartitionBytes(required_bytes) => {
+                let partition_data = &mut self.partition_data[partition_id];
+                let mut accumulated_bytes = 0;
+                let mut data_blocks = Vec::new();
+                while let Some(data_block) = partition_data.pop() {
+                    accumulated_bytes += data_block.memory_size();
+                    data_blocks.push(data_block);
+                    if accumulated_bytes >= *required_bytes {
+                        break;
+                    }
+                }
+                self.memory_size -= accumulated_bytes;
+                self.partition_size[partition_id] -= accumulated_bytes;
+                Some(data_blocks)
+            }
+        };
+        Ok(data_blocks)
+    }
+
+    pub fn memory_size(&self) -> usize {
+        self.memory_size
+    }
+
+    pub fn partition_memory_size(&self, partition_id: usize) -> usize {
+        self.partition_size[partition_id]
+    }
+
+    pub fn partition_ids(&self) -> Vec<usize> {
+        let mut partition_ids = vec![];
+        for (partition_id, data) in self.partition_data.iter().enumerate() {
+            if !data.is_empty() {
+                partition_ids.push(partition_id);
+            }
+        }
+        partition_ids
+    }
+
+    pub fn is_partition_empty(&self, partition_id: usize) -> bool {
+        self.partition_data[partition_id].is_empty()
+    }
+}
diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs
index 164e6a81820c..54c4bd58129d 100644
--- a/src/query/service/src/spillers/spiller.rs
+++ b/src/query/service/src/spillers/spiller.rs
@@ -16,6 +16,7 @@ use std::collections::HashMap;
 use std::collections::HashSet;
 use std::fmt::Display;
 use std::fmt::Formatter;
+use std::ops::Range;
 use std::sync::Arc;
 use std::time::Instant;
 
@@ -37,6 +38,7 @@ use crate::sessions::QueryContext;
 pub enum SpillerType {
     HashJoinBuild,
     HashJoinProbe,
+    Window,
     OrderBy,
     // Todo: Add more spillers type
     // Aggregation
@@ -47,6 +49,7 @@ impl Display for SpillerType {
         match self {
             SpillerType::HashJoinBuild => write!(f, "HashJoinBuild"),
             SpillerType::HashJoinProbe => write!(f, "HashJoinProbe"),
+            SpillerType::Window => write!(f, "Window"),
             SpillerType::OrderBy => write!(f, "OrderBy"),
         }
     }
@@ -78,11 +81,11 @@ pub struct Spiller {
     _spiller_type: SpillerType,
     pub join_spilling_partition_bits: usize,
     /// 1 partition -> N partition files
-    pub partition_location: HashMap<u8, Vec<String>>,
+    pub partition_location: HashMap<usize, Vec<String>>,
     /// Record columns layout for spilled data, will be used when read data from disk
-    pub columns_layout: HashMap<String, Vec<usize>>,
+    pub columns_layout: HashMap<String, Vec<u64>>,
     /// Record how many bytes have been spilled for each partition.
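Note: the `PartitionBuffer` added above only releases a partition's blocks once enough bytes have accumulated (the `PickPartitionWithThreshold` path); otherwise data keeps buffering in memory. A self-contained sketch of that policy, with plain byte payloads standing in for `DataBlock` so it compiles on its own; the toy type below is illustrative, not the patch's API:

```rust
/// Simplified stand-in for PartitionBuffer: byte payloads buffered per partition,
/// released only once the partition holds at least `threshold` bytes.
struct ToyPartitionBuffer {
    partition_data: Vec<Vec<Vec<u8>>>,
    partition_size: Vec<usize>,
}

impl ToyPartitionBuffer {
    fn create(num_partitions: usize) -> Self {
        Self {
            partition_data: vec![Vec::new(); num_partitions],
            partition_size: vec![0; num_partitions],
        }
    }

    fn add(&mut self, partition_id: usize, payload: Vec<u8>) {
        self.partition_size[partition_id] += payload.len();
        self.partition_data[partition_id].push(payload);
    }

    /// Analogue of `PickPartitionWithThreshold`: drain the partition only when its
    /// buffered size reaches the threshold, otherwise keep buffering.
    fn pick_with_threshold(&mut self, partition_id: usize, threshold: usize) -> Option<Vec<Vec<u8>>> {
        if self.partition_size[partition_id] >= threshold {
            self.partition_size[partition_id] = 0;
            Some(std::mem::take(&mut self.partition_data[partition_id]))
        } else {
            None
        }
    }
}

fn main() {
    let mut buffer = ToyPartitionBuffer::create(4);
    buffer.add(1, vec![0u8; 300]);
    assert!(buffer.pick_with_threshold(1, 1024).is_none()); // below threshold: stays buffered
    buffer.add(1, vec![0u8; 900]);
    let released = buffer.pick_with_threshold(1, 1024).unwrap(); // 1200 bytes now: released
    assert_eq!(released.len(), 2);
}
```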
- pub partition_spilled_bytes: HashMap, + pub partition_spilled_bytes: HashMap, } impl Spiller { @@ -106,99 +109,65 @@ impl Spiller { }) } - pub fn spilled_partitions(&self) -> HashSet { + pub fn spilled_partitions(&self) -> HashSet { self.partition_location.keys().copied().collect() } - /// Read a certain file to a [`DataBlock`]. - /// We should guarantee that the file is managed by this spiller. - pub async fn read_spilled_file(&self, file: &str) -> Result { - debug_assert!(self.columns_layout.contains_key(file)); - let data = self.operator.read(file).await?.to_bytes(); - let bytes = data.len(); - - let mut begin = 0; - let instant = Instant::now(); - let mut columns = Vec::with_capacity(self.columns_layout.len()); - let columns_layout = self.columns_layout.get(file).unwrap(); - for column_layout in columns_layout.iter() { - columns.push(deserialize_column(&data[begin..begin + column_layout]).unwrap()); - begin += column_layout; - } - let block = DataBlock::new_from_columns(columns); - - Profile::record_usize_profile(ProfileStatisticsName::SpillReadCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillReadBytes, bytes); - Profile::record_usize_profile( - ProfileStatisticsName::SpillReadTime, - instant.elapsed().as_millis() as usize, - ); - - Ok(block) - } + /// Spill a [`DataBlock`] to storage. + pub async fn spill(&mut self, data_block: DataBlock) -> Result { + // Serialize data block. + let (data_size, columns_data, columns_layout) = self.serialize_data_block(data_block)?; - /// Write a [`DataBlock`] to storage. - pub async fn spill_block(&mut self, data: DataBlock) -> Result { + // Spill data to storage. let instant = Instant::now(); let unique_name = GlobalUniqName::unique(); let location = format!("{}/{}", self.config.location_prefix, unique_name); - let mut write_bytes = 0; - let mut writer = self .operator .writer_with(&location) .chunk(8 * 1024 * 1024) .await?; - let columns = data.columns().to_vec(); - let mut columns_data = Vec::with_capacity(columns.len()); - for column in columns.into_iter() { - let column = column - .value - .convert_to_full_column(&column.data_type, data.num_rows()); - let column_data = serialize_column(&column); - self.columns_layout - .entry(location.to_string()) - .and_modify(|layouts| { - layouts.push(column_data.len()); - }) - .or_insert(vec![column_data.len()]); - write_bytes += column_data.len(); - columns_data.push(column_data); - } - for data in columns_data.into_iter() { writer.write(data).await?; } writer.close().await?; + // Record statistics. Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, data_size as usize); Profile::record_usize_profile( ProfileStatisticsName::SpillWriteTime, instant.elapsed().as_millis() as usize, ); + // Record columns layout for spilled data. 
+ self.columns_layout.insert(location.clone(), columns_layout); + Ok(location) } #[async_backtrace::framed] - /// Spill data block with location - pub async fn spill_with_partition(&mut self, p_id: u8, data: DataBlock) -> Result<()> { + /// Spill data block with partition + pub async fn spill_with_partition( + &mut self, + partition_id: usize, + data: DataBlock, + ) -> Result<()> { let progress_val = ProgressValues { rows: data.num_rows(), bytes: data.memory_size(), }; self.partition_spilled_bytes - .entry(p_id) + .entry(partition_id) .and_modify(|bytes| { *bytes += data.memory_size() as u64; }) .or_insert(data.memory_size() as u64); - let location = self.spill_block(data).await?; + let location = self.spill(data).await?; self.partition_location - .entry(p_id) + .entry(partition_id) .and_modify(|locs| { locs.push(location.clone()); }) @@ -208,9 +177,89 @@ impl Spiller { Ok(()) } + pub async fn spill_with_merged_partitions( + &mut self, + partitioned_data: Vec<(usize, DataBlock)>, + ) -> Result { + // Serialize data block. + let mut write_bytes = 0; + let mut write_data = Vec::with_capacity(partitioned_data.len()); + let mut spilled_partitions = Vec::with_capacity(partitioned_data.len()); + for (partition_id, data_block) in partitioned_data.into_iter() { + let begin = write_bytes; + let (data_size, columns_data, columns_layout) = + self.serialize_data_block(data_block)?; + + write_bytes += data_size; + write_data.push(columns_data); + spilled_partitions.push((partition_id, begin..write_bytes, columns_layout)); + } + + // Spill data to storage. + let instant = Instant::now(); + let unique_name = GlobalUniqName::unique(); + let location = format!("{}/{}", self.config.location_prefix, unique_name); + let mut writer = self + .operator + .writer_with(&location) + .chunk(8 * 1024 * 1024) + .await?; + for write_bucket_data in write_data.into_iter() { + for data in write_bucket_data.into_iter() { + writer.write(data).await?; + } + } + writer.close().await?; + + // Record statistics. + Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); + Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes as usize); + Profile::record_usize_profile( + ProfileStatisticsName::SpillWriteTime, + instant.elapsed().as_millis() as usize, + ); + + Ok(SpilledData::MergedPartition { + location, + partitions: spilled_partitions, + }) + } + + /// Read a certain file to a [`DataBlock`]. + /// We should guarantee that the file is managed by this spiller. + pub async fn read_spilled_file(&self, file: &str) -> Result { + debug_assert!(self.columns_layout.contains_key(file)); + + // Read spilled data from storage. + let instant = Instant::now(); + let data = self.operator.read(file).await?.to_bytes(); + + // Record statistics. + Profile::record_usize_profile(ProfileStatisticsName::SpillReadCount, 1); + Profile::record_usize_profile(ProfileStatisticsName::SpillReadBytes, data.len()); + Profile::record_usize_profile( + ProfileStatisticsName::SpillReadTime, + instant.elapsed().as_millis() as usize, + ); + + // Deserialize data block. 
+ let mut begin = 0; + let mut columns = Vec::with_capacity(self.columns_layout.len()); + let columns_layout = self.columns_layout.get(file).unwrap(); + for column_layout in columns_layout.iter() { + columns.push( + deserialize_column(&data[begin as usize..(begin + column_layout) as usize]) + .unwrap(), + ); + begin += column_layout; + } + + Ok(DataBlock::new_from_columns(columns)) + } + #[async_backtrace::framed] /// Read spilled data with partition id - pub async fn read_spilled_partition(&mut self, p_id: &u8) -> Result> { + pub async fn read_spilled_partition(&mut self, p_id: &usize) -> Result> { if let Some(files) = self.partition_location.get(p_id) { let mut spilled_data = Vec::with_capacity(files.len()); for file in files.iter() { @@ -225,7 +274,112 @@ impl Spiller { } } + pub async fn read_merged_partitions( + &self, + merged_partitions: &SpilledData, + ) -> Result> { + if let SpilledData::MergedPartition { + location, + partitions, + } = merged_partitions + { + // Read spilled data from storage. + let instant = Instant::now(); + let data = self.operator.read(location).await?.to_bytes(); + + // Record statistics. + Profile::record_usize_profile(ProfileStatisticsName::SpillReadCount, 1); + Profile::record_usize_profile(ProfileStatisticsName::SpillReadBytes, data.len()); + Profile::record_usize_profile( + ProfileStatisticsName::SpillReadTime, + instant.elapsed().as_millis() as usize, + ); + + // Deserialize partitioned data block. + let mut partitioned_data = Vec::with_capacity(partitions.len()); + for (partition_id, range, columns_layout) in partitions.iter() { + let mut begin = range.start; + let mut columns = Vec::with_capacity(columns_layout.len()); + for column_layout in columns_layout.iter() { + columns.push( + deserialize_column(&data[begin as usize..(begin + column_layout) as usize]) + .unwrap(), + ); + begin += column_layout; + } + partitioned_data.push((*partition_id, DataBlock::new_from_columns(columns))); + } + return Ok(partitioned_data); + } + Ok(vec![]) + } + + pub async fn read_range( + &self, + location: &str, + data_range: Range, + columns_layout: &[u64], + ) -> Result { + // Read spilled data from storage. + let instant = Instant::now(); + let data = self + .operator + .read_with(location) + .range(data_range) + .await? + .to_vec(); + + // Record statistics. + Profile::record_usize_profile(ProfileStatisticsName::SpillReadCount, 1); + Profile::record_usize_profile(ProfileStatisticsName::SpillReadBytes, data.len()); + Profile::record_usize_profile( + ProfileStatisticsName::SpillReadTime, + instant.elapsed().as_millis() as usize, + ); + + // Deserialize data block. + let mut begin = 0; + let mut columns = Vec::with_capacity(columns_layout.len()); + for column_layout in columns_layout.iter() { + columns.push( + deserialize_column(&data[begin as usize..(begin + column_layout) as usize]) + .unwrap(), + ); + begin += column_layout; + } + + Ok(DataBlock::new_from_columns(columns)) + } + pub(crate) fn spilled_files(&self) -> Vec { self.columns_layout.keys().cloned().collect() } + + // Serialize data block to (data_size, columns_data, columns_layout). 
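Note: the spill layout used by `spill_with_merged_partitions`, `read_merged_partitions`, and `read_range` above (and produced by `serialize_data_block` below) has two levels of bookkeeping: one spill object holds several partitions, each tracked as a byte `Range`, and each partition records a `columns_layout` of per-column serialized lengths so reads can slice columns back out by walking offsets. A standalone round trip of that bookkeeping, with `Vec<u8>` buffers standing in for Arrow-serialized columns; names here are illustrative, not the Databend API:

```rust
use std::ops::Range;

/// One spill object: a single blob plus, per partition, its byte range in the blob
/// and the serialized length of each of its columns.
struct MergedSpill {
    blob: Vec<u8>,
    partitions: Vec<(usize, Range<u64>, Vec<u64>)>,
}

/// Write side: concatenate every partition's columns into one blob, remembering
/// where each partition starts/ends and its per-column layout.
fn spill_merged(partitioned_columns: Vec<(usize, Vec<Vec<u8>>)>) -> MergedSpill {
    let mut blob = Vec::new();
    let mut partitions = Vec::with_capacity(partitioned_columns.len());
    for (partition_id, columns) in partitioned_columns {
        let begin = blob.len() as u64;
        let mut columns_layout = Vec::with_capacity(columns.len());
        for column in columns {
            columns_layout.push(column.len() as u64);
            blob.extend_from_slice(&column);
        }
        partitions.push((partition_id, begin..blob.len() as u64, columns_layout));
    }
    MergedSpill { blob, partitions }
}

/// Read side: take one partition's range, then walk its column layout to slice
/// each serialized column back out.
fn read_partition(spill: &MergedSpill, partition_id: usize) -> Option<Vec<Vec<u8>>> {
    let (_, range, columns_layout) = spill
        .partitions
        .iter()
        .find(|(id, _, _)| *id == partition_id)?;
    let mut begin = range.start as usize;
    let mut columns = Vec::with_capacity(columns_layout.len());
    for len in columns_layout {
        let end = begin + *len as usize;
        columns.push(spill.blob[begin..end].to_vec());
        begin = end;
    }
    Some(columns)
}

fn main() {
    let spill = spill_merged(vec![
        (0, vec![vec![1, 2, 3], vec![4, 5]]),
        (7, vec![vec![9; 4]]),
    ]);
    assert_eq!(read_partition(&spill, 7), Some(vec![vec![9; 4]]));
    assert_eq!(read_partition(&spill, 0).unwrap()[1], vec![4, 5]);
}
```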
+ fn serialize_data_block(&self, data_block: DataBlock) -> Result<(u64, Vec>, Vec)> { + let num_columns = data_block.num_columns(); + let mut data_size = 0; + let mut columns_data = Vec::with_capacity(num_columns); + let mut columns_layout = Vec::with_capacity(num_columns); + + for column in data_block.columns() { + let column = column + .value + .convert_to_full_column(&column.data_type, data_block.num_rows()); + let column_data = serialize_column(&column); + + data_size += column_data.len() as u64; + columns_layout.push(column_data.len() as u64); + columns_data.push(column_data); + } + Ok((data_size, columns_data, columns_layout)) + } +} + +pub enum SpilledData { + Partition(String), + MergedPartition { + location: String, + partitions: Vec<(usize, Range, Vec)>, + }, } diff --git a/src/query/service/src/spillers/spiller_buffer.rs b/src/query/service/src/spillers/spiller_buffer.rs deleted file mode 100644 index c08a79bb410c..000000000000 --- a/src/query/service/src/spillers/spiller_buffer.rs +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use databend_common_exception::Result; -use databend_common_expression::DataBlock; - -// The spiller buffer will record each partition's unspilled data. -#[derive(Clone)] -pub struct SpillBuffer { - partition_data: Vec>, - partition_size: Vec, - partition_threshold: usize, -} - -impl SpillBuffer { - pub fn create(num_partitions: usize, buffer_threshold: usize) -> Self { - // The threshold of each partition, we will spill the partition data if the - // size exceeds the threshold. - let partition_threshold = - (buffer_threshold as f64 / num_partitions as f64 * 1024.0 * 1024.0) as usize; - SpillBuffer { - partition_data: vec![Vec::new(); num_partitions], - partition_size: vec![0; num_partitions], - partition_threshold, - } - } - - // Add a partition's unspilled data to the SpillBuffer. - pub fn add_partition_data(&mut self, partition_id: usize, data_block: DataBlock) { - let data_size = data_block.memory_size(); - self.partition_size[partition_id] += data_size; - self.partition_data[partition_id].push(data_block); - } - - // Check if the partition buffer is full, we will return the partition data. - pub fn pick_data_to_spill(&mut self, partition_id: usize) -> Result> { - if self.partition_size[partition_id] >= self.partition_threshold { - let data_blocks = self.partition_data[partition_id].clone(); - self.partition_data[partition_id].clear(); - self.partition_size[partition_id] = 0; - let data_block = DataBlock::concat(&data_blocks)?; - Ok(Some(data_block)) - } else { - Ok(None) - } - } - - // Get partition data by partition id. 
- pub fn read_partition_data(&mut self, partition_id: u8, pick: bool) -> Option> { - if !self.partition_data[partition_id as usize].is_empty() { - let data_blocks = if pick { - let data_blocks = self.partition_data[partition_id as usize].clone(); - self.partition_data[partition_id as usize].clear(); - data_blocks - } else { - self.partition_data[partition_id as usize].clone() - }; - Some(data_blocks) - } else { - None - } - } - - pub fn buffered_partitions(&self) -> Vec { - let mut partition_ids = vec![]; - for (partition_id, data) in self.partition_data.iter().enumerate() { - if !data.is_empty() { - partition_ids.push(partition_id as u8); - } - } - partition_ids - } - - pub fn empty_partition(&self, partition_id: u8) -> bool { - self.partition_data[partition_id as usize].is_empty() - } -} diff --git a/src/query/service/tests/it/pipelines/executor/executor_graph.rs b/src/query/service/tests/it/pipelines/executor/executor_graph.rs index e46e883d2a66..2efe0cfbcf08 100644 --- a/src/query/service/tests/it/pipelines/executor/executor_graph.rs +++ b/src/query/service/tests/it/pipelines/executor/executor_graph.rs @@ -51,8 +51,8 @@ async fn test_create_simple_pipeline() -> Result<()> { \n 0 [ label = \"BlocksSource\" ]\ \n 1 [ label = \"DummyTransform\" ]\ \n 2 [ label = \"SyncSenderSink\" ]\ - \n 0 -> 1 [ ]\ - \n 1 -> 2 [ ]\ + \n 0 -> 1 [ 0 -> 0]\ + \n 1 -> 2 [ 0 -> 0]\ \n}\n" ); @@ -73,10 +73,10 @@ async fn test_create_parallel_simple_pipeline() -> Result<()> { \n 3 [ label = \"DummyTransform\" ]\ \n 4 [ label = \"SyncSenderSink\" ]\ \n 5 [ label = \"SyncSenderSink\" ]\ - \n 0 -> 2 [ ]\ - \n 1 -> 3 [ ]\ - \n 2 -> 4 [ ]\ - \n 3 -> 5 [ ]\ + \n 0 -> 2 [ 0 -> 0]\ + \n 1 -> 3 [ 0 -> 0]\ + \n 2 -> 4 [ 0 -> 0]\ + \n 3 -> 5 [ 0 -> 0]\ \n}\n" ); @@ -100,15 +100,15 @@ async fn test_create_resize_pipeline() -> Result<()> { \n 6 [ label = \"Resize\" ]\ \n 7 [ label = \"SyncSenderSink\" ]\ \n 8 [ label = \"SyncSenderSink\" ]\ - \n 0 -> 1 [ ]\ - \n 1 -> 2 [ ]\ - \n 1 -> 3 [ ]\ - \n 2 -> 4 [ ]\ - \n 3 -> 4 [ ]\ - \n 4 -> 5 [ ]\ - \n 5 -> 6 [ ]\ - \n 6 -> 7 [ ]\ - \n 6 -> 8 [ ]\ + \n 0 -> 1 [ 0 -> 0]\ + \n 1 -> 2 [ 0 -> 0]\ + \n 1 -> 3 [ 1 -> 0]\ + \n 2 -> 4 [ 0 -> 0]\ + \n 3 -> 4 [ 0 -> 1]\ + \n 4 -> 5 [ 0 -> 0]\ + \n 5 -> 6 [ 0 -> 0]\ + \n 6 -> 7 [ 0 -> 0]\ + \n 6 -> 8 [ 1 -> 0]\ \n}\n" ); diff --git a/src/query/service/tests/it/spillers/spiller.rs b/src/query/service/tests/it/spillers/spiller.rs index afedd9fe0631..387ccd5a4062 100644 --- a/src/query/service/tests/it/spillers/spiller.rs +++ b/src/query/service/tests/it/spillers/spiller.rs @@ -47,13 +47,13 @@ async fn test_spill_with_partition() -> Result<()> { Int32Type::from_data((1..101).collect::>()), ]); - let res = spiller.spill_with_partition(0_u8, data).await; + let res = spiller.spill_with_partition(0, data).await; assert!(res.is_ok()); assert!(spiller.partition_location.get(&0).unwrap()[0].starts_with("_query_spill")); // Test read spilled data - let block = DataBlock::concat(&spiller.read_spilled_partition(&(0_u8)).await?)?; + let block = DataBlock::concat(&spiller.read_spilled_partition(&(0)).await?)?; assert_eq!(block.num_rows(), 100); assert_eq!(block.num_columns(), 2); for (col_idx, col) in block.columns().iter().enumerate() { diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 26944906f790..deee0343df8c 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -460,7 +460,24 @@ impl DefaultSettings { mode: SettingMode::Both, range: 
Some(SettingRange::Numeric(0..=100)), }), - + ("window_num_partitions", DefaultSettingValue { + value: UserSettingValue::UInt64(256), + desc: "Sets the number of partitions for window operator.", + mode: SettingMode::Both, + range: Some(SettingRange::Numeric(0..=u64::MAX)), + }), + ("window_spill_unit_size_mb", DefaultSettingValue { + value: UserSettingValue::UInt64(256), + desc: "Sets the spill unit size (MB) for window operator.", + mode: SettingMode::Both, + range: Some(SettingRange::Numeric(0..=u64::MAX)), + }), + ("window_partition_sort_block_size", DefaultSettingValue { + value: UserSettingValue::UInt64(65536), + desc: "Sets the block size of data blocks to be sorted in window partition.", + mode: SettingMode::Both, + range: Some(SettingRange::Numeric(0..=u64::MAX)), + }), ("sort_spilling_bytes_threshold_per_proc", DefaultSettingValue { value: UserSettingValue::UInt64(0), desc: "Sets the maximum amount of memory in bytes that a sorter can use before spilling data to storage during query execution.", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 7a71ede7e73a..e3313d7e7353 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -399,6 +399,18 @@ impl Settings { Ok(self.try_get_u64("window_partition_spilling_memory_ratio")? as usize) } + pub fn get_window_num_partitions(&self) -> Result { + Ok(self.try_get_u64("window_num_partitions")? as usize) + } + + pub fn get_window_spill_unit_size_mb(&self) -> Result { + Ok(self.try_get_u64("window_spill_unit_size_mb")? as usize) + } + + pub fn get_window_partition_sort_block_size(&self) -> Result { + self.try_get_u64("window_partition_sort_block_size") + } + pub fn get_sort_spilling_bytes_threshold_per_proc(&self) -> Result { Ok(self.try_get_u64("sort_spilling_bytes_threshold_per_proc")? 
as usize) } diff --git a/src/query/sql/src/executor/format.rs b/src/query/sql/src/executor/format.rs index b0dda598d991..3ece5cf2e064 100644 --- a/src/query/sql/src/executor/format.rs +++ b/src/query/sql/src/executor/format.rs @@ -1089,6 +1089,10 @@ fn aggregate_partial_to_format_tree( children.extend(items); } + if let Some((_, r)) = &plan.rank_limit { + children.push(FormatTreeNode::new(format!("rank limit: {r}"))); + } + append_profile_info(&mut children, profs, plan.plan_id); children.push(to_format_tree(&plan.input, metadata, profs)?); @@ -1130,11 +1134,6 @@ fn aggregate_final_to_format_tree( FormatTreeNode::new(format!("aggregate functions: [{agg_funcs}]")), ]; - if let Some(limit) = &plan.limit { - let items = FormatTreeNode::new(format!("limit: {limit}")); - children.push(items); - } - if let Some(info) = &plan.stat_info { let items = plan_stats_info_to_format_tree(info); children.extend(items); diff --git a/src/query/sql/src/executor/mod.rs b/src/query/sql/src/executor/mod.rs index 4b3712a1bb65..2b1ae9d2fd28 100644 --- a/src/query/sql/src/executor/mod.rs +++ b/src/query/sql/src/executor/mod.rs @@ -16,7 +16,6 @@ mod explain; mod format; mod physical_plan; mod physical_plan_builder; -mod physical_plan_display; mod physical_plan_visitor; pub mod physical_plans; mod util; diff --git a/src/query/sql/src/executor/physical_plan_display.rs b/src/query/sql/src/executor/physical_plan_display.rs deleted file mode 100644 index 953f3aa45725..000000000000 --- a/src/query/sql/src/executor/physical_plan_display.rs +++ /dev/null @@ -1,589 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
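Note: the `physical_plan_display.rs` module deleted here carried the `format_indent` Display path: each plan node printed a single line, and the wrapper recursed into `children()` one indent level deeper. A generic sketch of that indent-walk pattern, with an illustrative `Node` type in place of the real `PhysicalPlan`:

```rust
use std::fmt::{self, Display, Formatter};

// Illustrative plan node; the removed code walked `PhysicalPlan::children()` instead.
struct Node {
    name: &'static str,
    children: Vec<Node>,
}

impl Node {
    fn format_indent(&self, indent: usize) -> IndentDisplay<'_> {
        IndentDisplay { indent, node: self }
    }
}

struct IndentDisplay<'a> {
    indent: usize,
    node: &'a Node,
}

impl Display for IndentDisplay<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // One line per node at the current depth, then every child one level deeper:
        // the same shape the removed Display impls produced.
        write!(f, "{}{}", "    ".repeat(self.indent), self.node.name)?;
        for child in &self.node.children {
            writeln!(f)?;
            write!(f, "{}", child.format_indent(self.indent + 1))?;
        }
        Ok(())
    }
}

fn main() {
    let plan = Node {
        name: "HashJoin: INNER",
        children: vec![
            Node { name: "TableScan: [t1]", children: vec![] },
            Node { name: "TableScan: [t2]", children: vec![] },
        ],
    };
    println!("{}", plan.format_indent(0));
}
```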
- -use std::fmt::Display; -use std::fmt::Formatter; - -use databend_common_functions::BUILTIN_FUNCTIONS; -use itertools::Itertools; - -use crate::executor::physical_plan::PhysicalPlan; -use crate::executor::physical_plans::AddStreamColumn; -use crate::executor::physical_plans::AggregateExpand; -use crate::executor::physical_plans::AggregateFinal; -use crate::executor::physical_plans::AggregatePartial; -use crate::executor::physical_plans::AsyncFunction; -use crate::executor::physical_plans::CacheScan; -use crate::executor::physical_plans::ColumnMutation; -use crate::executor::physical_plans::CommitSink; -use crate::executor::physical_plans::CompactSource; -use crate::executor::physical_plans::ConstantTableScan; -use crate::executor::physical_plans::CopyIntoLocation; -use crate::executor::physical_plans::CopyIntoTable; -use crate::executor::physical_plans::CteScan; -use crate::executor::physical_plans::DistributedInsertSelect; -use crate::executor::physical_plans::EvalScalar; -use crate::executor::physical_plans::Exchange; -use crate::executor::physical_plans::ExchangeSink; -use crate::executor::physical_plans::ExchangeSource; -use crate::executor::physical_plans::ExpressionScan; -use crate::executor::physical_plans::Filter; -use crate::executor::physical_plans::HashJoin; -use crate::executor::physical_plans::Limit; -use crate::executor::physical_plans::MaterializedCte; -use crate::executor::physical_plans::Mutation; -use crate::executor::physical_plans::MutationManipulate; -use crate::executor::physical_plans::MutationOrganize; -use crate::executor::physical_plans::MutationSource; -use crate::executor::physical_plans::MutationSplit; -use crate::executor::physical_plans::ProjectSet; -use crate::executor::physical_plans::RangeJoin; -use crate::executor::physical_plans::Recluster; -use crate::executor::physical_plans::ReplaceAsyncSourcer; -use crate::executor::physical_plans::ReplaceDeduplicate; -use crate::executor::physical_plans::ReplaceInto; -use crate::executor::physical_plans::RowFetch; -use crate::executor::physical_plans::Sort; -use crate::executor::physical_plans::TableScan; -use crate::executor::physical_plans::Udf; -use crate::executor::physical_plans::UnionAll; -use crate::executor::physical_plans::Window; -use crate::executor::physical_plans::WindowPartition; -use crate::plans::CacheSource; -use crate::plans::JoinType; - -impl PhysicalPlan { - pub fn format_indent(&self, indent: usize) -> impl std::fmt::Display + '_ { - PhysicalPlanIndentFormatDisplay { indent, node: self } - } -} - -pub struct PhysicalPlanIndentFormatDisplay<'a> { - indent: usize, - node: &'a PhysicalPlan, -} - -impl<'a> Display for PhysicalPlanIndentFormatDisplay<'a> { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "{}", " ".repeat(self.indent))?; - - match self.node { - PhysicalPlan::TableScan(scan) => write!(f, "{}", scan)?, - PhysicalPlan::Filter(filter) => write!(f, "{}", filter)?, - PhysicalPlan::EvalScalar(eval_scalar) => write!(f, "{}", eval_scalar)?, - PhysicalPlan::AggregateExpand(aggregate) => write!(f, "{}", aggregate)?, - PhysicalPlan::AggregatePartial(aggregate) => write!(f, "{}", aggregate)?, - PhysicalPlan::AggregateFinal(aggregate) => write!(f, "{}", aggregate)?, - PhysicalPlan::Window(window) => write!(f, "{}", window)?, - PhysicalPlan::WindowPartition(window_partition) => write!(f, "{}", window_partition)?, - PhysicalPlan::Sort(sort) => write!(f, "{}", sort)?, - PhysicalPlan::Limit(limit) => write!(f, "{}", limit)?, - PhysicalPlan::RowFetch(row_fetch) => write!(f, "{}", 
row_fetch)?, - PhysicalPlan::HashJoin(join) => write!(f, "{}", join)?, - PhysicalPlan::Exchange(exchange) => write!(f, "{}", exchange)?, - PhysicalPlan::ExchangeSource(source) => write!(f, "{}", source)?, - PhysicalPlan::ExchangeSink(sink) => write!(f, "{}", sink)?, - PhysicalPlan::UnionAll(union_all) => write!(f, "{}", union_all)?, - PhysicalPlan::DistributedInsertSelect(insert_select) => write!(f, "{}", insert_select)?, - PhysicalPlan::CompactSource(compact) => write!(f, "{}", compact)?, - PhysicalPlan::CommitSink(commit) => write!(f, "{}", commit)?, - PhysicalPlan::ProjectSet(unnest) => write!(f, "{}", unnest)?, - PhysicalPlan::RangeJoin(plan) => write!(f, "{}", plan)?, - PhysicalPlan::CopyIntoTable(copy_into_table) => write!(f, "{}", copy_into_table)?, - PhysicalPlan::CopyIntoLocation(copy_into_location) => { - write!(f, "{}", copy_into_location)? - } - PhysicalPlan::MutationSource(mutation_source) => write!(f, "{}", mutation_source)?, - PhysicalPlan::ReplaceAsyncSourcer(async_sourcer) => write!(f, "{}", async_sourcer)?, - PhysicalPlan::ReplaceDeduplicate(deduplicate) => write!(f, "{}", deduplicate)?, - PhysicalPlan::ReplaceInto(replace) => write!(f, "{}", replace)?, - PhysicalPlan::ColumnMutation(column_mutation) => write!(f, "{}", column_mutation)?, - PhysicalPlan::Mutation(merge_into) => write!(f, "{}", merge_into)?, - PhysicalPlan::MutationSplit(merge_into_split) => write!(f, "{}", merge_into_split)?, - PhysicalPlan::MutationManipulate(merge_into_manipulate) => { - write!(f, "{}", merge_into_manipulate)? - } - PhysicalPlan::MutationOrganize(merge_into_organize) => { - write!(f, "{}", merge_into_organize)? - } - PhysicalPlan::AddStreamColumn(add_stream_column) => write!(f, "{}", add_stream_column)?, - PhysicalPlan::CteScan(cte_scan) => write!(f, "{}", cte_scan)?, - PhysicalPlan::RecursiveCteScan(recursive_cte_scan) => { - write!(f, "{}", recursive_cte_scan)? 
- } - PhysicalPlan::MaterializedCte(plan) => write!(f, "{}", plan)?, - PhysicalPlan::ConstantTableScan(scan) => write!(f, "{}", scan)?, - PhysicalPlan::ExpressionScan(scan) => write!(f, "{}", scan)?, - PhysicalPlan::CacheScan(scan) => write!(f, "{}", scan)?, - PhysicalPlan::Recluster(plan) => write!(f, "{}", plan)?, - PhysicalPlan::Udf(udf) => write!(f, "{}", udf)?, - PhysicalPlan::Duplicate(_) => "Duplicate".fmt(f)?, - PhysicalPlan::Shuffle(_) => "Shuffle".fmt(f)?, - PhysicalPlan::ChunkFilter(_) => "ChunkFilter".fmt(f)?, - PhysicalPlan::ChunkEvalScalar(_) => "ChunkEvalScalar".fmt(f)?, - PhysicalPlan::ChunkCastSchema(_) => "ChunkCastSchema".fmt(f)?, - PhysicalPlan::ChunkFillAndReorder(_) => "ChunkFillAndReorder".fmt(f)?, - PhysicalPlan::ChunkAppendData(_) => "ChunkAppendData".fmt(f)?, - PhysicalPlan::ChunkMerge(_) => "ChunkMerge".fmt(f)?, - PhysicalPlan::ChunkCommitInsert(_) => "ChunkCommitInsert".fmt(f)?, - PhysicalPlan::AsyncFunction(_) => "AsyncFunction".fmt(f)?, - } - - for node in self.node.children() { - writeln!(f)?; - write!(f, "{}", node.format_indent(self.indent + 1))?; - } - - Ok(()) - } -} - -impl Display for TableScan { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "TableScan: [{}]", self.source.source_info.desc()) - } -} - -impl Display for CteScan { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "CteScan: [{}]", self.cte_idx.0) - } -} - -impl Display for MaterializedCte { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "MaterializedCte") - } -} - -impl Display for ConstantTableScan { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let columns = self - .values - .iter() - .enumerate() - .map(|(i, value)| { - let column = value.iter().map(|val| format!("{val}")).join(", "); - format!("column {}: [{}]", i, column) - }) - .collect::>(); - - write!(f, "{}: {}", self.name(), columns.join(", ")) - } -} - -impl Display for ExpressionScan { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let columns = self - .values - .iter() - .enumerate() - .map(|(i, value)| { - let column = value - .iter() - .map(|val| val.as_expr(&BUILTIN_FUNCTIONS).sql_display()) - .join(", "); - format!("column {}: [{}]", i, column) - }) - .collect::>(); - - write!(f, "ExpressionScan: {}", columns.join(", ")) - } -} - -impl Display for CacheScan { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match &self.cache_source { - CacheSource::HashJoinBuild((cache_index, column_indexes)) => { - write!( - f, - "CacheScan: [cache_index: {}, column_indexes: {:?}]", - cache_index, column_indexes - ) - } - } - } -} - -impl Display for Filter { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let predicates = self - .predicates - .iter() - .map(|pred| pred.as_expr(&BUILTIN_FUNCTIONS).sql_display()) - .join(", "); - - write!(f, "Filter: [{predicates}]") - } -} - -impl Display for Sort { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let scalars = self - .order_by - .iter() - .map(|item| { - format!( - "{} {}", - item.order_by, - if item.asc { "ASC" } else { "DESC" } - ) - }) - .collect::>(); - let limit = self.limit.as_ref().cloned().unwrap_or(0); - write!(f, "Sort: [{}], Limit: [{}]", scalars.join(", "), limit) - } -} - -impl Display for EvalScalar { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let scalars = self - .exprs - .iter() - .map(|(expr, _)| expr.as_expr(&BUILTIN_FUNCTIONS).to_string()) - .collect::>(); - - write!(f, "EvalScalar: [{}]", scalars.join(", ")) - } -} - -impl 
Display for AggregateExpand { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let sets = self - .grouping_sets - .sets - .iter() - .map(|set| { - set.iter() - .map(|index| index.to_string()) - .collect::>() - .join(", ") - }) - .map(|s| format!("[{}]", s)) - .collect::>() - .join(", "); - write!(f, "Aggregate(Expand): grouping sets: [{}]", sets) - } -} - -impl Display for AggregateFinal { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let group_items = self - .group_by - .iter() - .map(|v| v.to_string()) - .collect::>() - .join(", "); - - let agg_funcs = self - .agg_funcs - .iter() - .map(|item| { - format!( - "{}({})", - item.sig.name, - item.arg_indices - .iter() - .map(|index| index.to_string()) - .collect::>() - .join(", ") - ) - }) - .join(", "); - - write!( - f, - "Aggregate(Final): group items: [{}], aggregate functions: [{}]", - group_items, agg_funcs - ) - } -} - -impl Display for AggregatePartial { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let group_items = self - .group_by - .iter() - .map(|v| v.to_string()) - .collect::>() - .join(", "); - - let agg_funcs = self - .agg_funcs - .iter() - .map(|item| { - format!( - "{}({})", - item.sig.name, - item.arg_indices - .iter() - .map(|index| index.to_string()) - .collect::>() - .join(", ") - ) - }) - .join(", "); - - write!( - f, - "Aggregate(Partial): group items: [{}], aggregate functions: [{}]", - group_items, agg_funcs - ) - } -} - -impl Display for Window { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let window_id = self.plan_id; - write!(f, "Window: [{}]", window_id) - } -} - -impl Display for WindowPartition { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let window_partition_id = self.plan_id; - write!(f, "WindowPartition: [{}]", window_partition_id) - } -} - -impl Display for Limit { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let limit = self.limit.as_ref().cloned().unwrap_or(0); - write!(f, "Limit: [{}], Offset: [{}]", limit, self.offset) - } -} - -impl Display for RowFetch { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "RowFetch: [{:?}]", self.cols_to_fetch) - } -} - -impl Display for HashJoin { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - match self.join_type { - JoinType::Cross => { - write!(f, "CrossJoin") - } - _ => { - let build_keys = self - .build_keys - .iter() - .map(|scalar| scalar.as_expr(&BUILTIN_FUNCTIONS).sql_display()) - .collect::>() - .join(", "); - - let probe_keys = self - .probe_keys - .iter() - .map(|scalar| scalar.as_expr(&BUILTIN_FUNCTIONS).sql_display()) - .collect::>() - .join(", "); - - let join_filters = self - .non_equi_conditions - .iter() - .map(|scalar| scalar.as_expr(&BUILTIN_FUNCTIONS).sql_display()) - .collect::>() - .join(", "); - - write!( - f, - "HashJoin: {}, build keys: [{}], probe keys: [{}], join filters: [{}]", - &self.join_type, build_keys, probe_keys, join_filters, - ) - } - } - } -} - -impl Display for RangeJoin { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "IEJoin: {}", &self.join_type) - } -} - -impl Display for Exchange { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let keys = self - .keys - .iter() - .map(|key| key.as_expr(&BUILTIN_FUNCTIONS).sql_display()) - .join(", "); - - write!(f, "Exchange: [kind: {:?}, keys: {}]", self.kind, keys) - } -} - -impl Display for ExchangeSource { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!( - f, - "Exchange Source: fragment id: [{:?}]", - self.source_fragment_id - 
) - } -} - -impl Display for ExchangeSink { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!( - f, - "Exchange Sink: fragment id: [{:?}]", - self.destination_fragment_id - ) - } -} - -impl Display for UnionAll { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "UnionAll") - } -} - -impl Display for DistributedInsertSelect { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "DistributedInsertSelect") - } -} - -impl Display for CompactSource { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "CompactSource") - } -} - -impl Display for CommitSink { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "CommitSink") - } -} -impl Display for CopyIntoTable { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "CopyIntoTable") - } -} - -impl Display for CopyIntoLocation { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "CopyIntoLocation") - } -} - -impl Display for ProjectSet { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let scalars = self - .srf_exprs - .iter() - .map(|(expr, _)| expr.as_expr(&BUILTIN_FUNCTIONS).to_string()) - .collect::>(); - - write!( - f, - "ProjectSet: set-returning functions : {}", - scalars.join(", ") - ) - } -} - -impl Display for ReplaceAsyncSourcer { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "AsyncSourcer") - } -} - -impl Display for ReplaceDeduplicate { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "Deduplicate") - } -} - -impl Display for ReplaceInto { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "Replace") - } -} - -impl Display for MutationSource { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "MutationSource") - } -} - -impl Display for ColumnMutation { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "ColumnMutation") - } -} - -impl Display for Mutation { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "Mutation") - } -} - -impl Display for MutationSplit { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "MutationSplit") - } -} - -impl Display for MutationManipulate { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "MutationManipulate") - } -} - -impl Display for MutationOrganize { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "MutationOrganize") - } -} - -impl Display for AddStreamColumn { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "AddStreamColumn") - } -} - -impl Display for Recluster { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "Recluster") - } -} - -impl Display for Udf { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let scalars = self - .udf_funcs - .iter() - .map(|func| { - let arg_exprs = func.arg_exprs.join(", "); - format!("{}({})", func.func_name, arg_exprs) - }) - .collect::>(); - write!(f, "Udf functions: {}", scalars.join(", ")) - } -} - -impl Display for AsyncFunction { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let scalars = self - .async_func_descs - .iter() - .map(|func| func.display_name.clone()) - .collect::>(); - write!(f, "Async functions: {}", scalars.join(", ")) - } -} diff --git a/src/query/sql/src/executor/physical_plan_visitor.rs b/src/query/sql/src/executor/physical_plan_visitor.rs index 354c275cb95a..6dcab582c7f3 100644 --- a/src/query/sql/src/executor/physical_plan_visitor.rs +++ 
b/src/query/sql/src/executor/physical_plan_visitor.rs @@ -200,6 +200,7 @@ pub trait PhysicalPlanReplacer { group_by_display: plan.group_by_display.clone(), agg_funcs: plan.agg_funcs.clone(), stat_info: plan.stat_info.clone(), + rank_limit: plan.rank_limit.clone(), })) } @@ -214,7 +215,6 @@ pub trait PhysicalPlanReplacer { agg_funcs: plan.agg_funcs.clone(), group_by_display: plan.group_by_display.clone(), stat_info: plan.stat_info.clone(), - limit: plan.limit, })) } diff --git a/src/query/sql/src/executor/physical_plans/physical_aggregate_final.rs b/src/query/sql/src/executor/physical_plans/physical_aggregate_final.rs index 0e8d2e5aa891..dc8286f9c4c7 100644 --- a/src/query/sql/src/executor/physical_plans/physical_aggregate_final.rs +++ b/src/query/sql/src/executor/physical_plans/physical_aggregate_final.rs @@ -22,6 +22,7 @@ use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_expression::RemoteExpr; +use super::SortDesc; use crate::executor::explain::PlanStatsInfo; use crate::executor::physical_plans::AggregateExpand; use crate::executor::physical_plans::AggregateFunctionDesc; @@ -45,8 +46,6 @@ pub struct AggregateFinal { pub group_by: Vec, pub agg_funcs: Vec, pub before_group_by_schema: DataSchemaRef, - pub limit: Option, - pub group_by_display: Vec, // Only used for explain @@ -105,7 +104,7 @@ impl PhysicalPlanBuilder { aggregate_functions: used, from_distinct: agg.from_distinct, mode: agg.mode, - limit: agg.limit, + rank_limit: agg.rank_limit.clone(), grouping_sets: agg.grouping_sets.clone(), }; @@ -177,6 +176,19 @@ impl PhysicalPlanBuilder { } } + let rank_limit = agg.rank_limit.map(|(item, limit)| { + let desc = item + .iter() + .map(|v| SortDesc { + asc: v.asc, + nulls_first: v.nulls_first, + order_by: v.index, + display_name: self.metadata.read().column(v.index).name(), + }) + .collect::>(); + (desc, limit) + }); + match input { PhysicalPlan::Exchange(Exchange { input, kind, .. }) if group_by_shuffle_mode == "before_merge" => @@ -197,6 +209,7 @@ impl PhysicalPlanBuilder { group_by_display, group_by: group_items, stat_info: Some(stat_info), + rank_limit: None, } } else { AggregatePartial { @@ -207,6 +220,7 @@ impl PhysicalPlanBuilder { group_by_display, group_by: group_items, stat_info: Some(stat_info), + rank_limit, } }; @@ -275,6 +289,7 @@ impl PhysicalPlanBuilder { group_by: group_items, input: Box::new(PhysicalPlan::AggregateExpand(expand)), stat_info: Some(stat_info), + rank_limit: None, }) } else { PhysicalPlan::AggregatePartial(AggregatePartial { @@ -285,6 +300,7 @@ impl PhysicalPlanBuilder { group_by: group_items, input: Box::new(input), stat_info: Some(stat_info), + rank_limit, }) } } @@ -358,7 +374,7 @@ impl PhysicalPlanBuilder { match input { PhysicalPlan::AggregatePartial(ref partial) => { let before_group_by_schema = partial.input.output_schema()?; - let limit = agg.limit; + PhysicalPlan::AggregateFinal(AggregateFinal { plan_id: 0, group_by_display: partial.group_by_display.clone(), @@ -368,7 +384,6 @@ impl PhysicalPlanBuilder { before_group_by_schema, stat_info: Some(stat_info), - limit, }) } @@ -377,7 +392,6 @@ impl PhysicalPlanBuilder { .. 
}) => { let before_group_by_schema = partial.input.output_schema()?; - let limit = agg.limit; PhysicalPlan::AggregateFinal(AggregateFinal { plan_id: 0, @@ -388,7 +402,6 @@ impl PhysicalPlanBuilder { before_group_by_schema, stat_info: Some(stat_info), - limit, }) } diff --git a/src/query/sql/src/executor/physical_plans/physical_aggregate_partial.rs b/src/query/sql/src/executor/physical_plans/physical_aggregate_partial.rs index e6e1dca9cd4e..b47f9f00d031 100644 --- a/src/query/sql/src/executor/physical_plans/physical_aggregate_partial.rs +++ b/src/query/sql/src/executor/physical_plans/physical_aggregate_partial.rs @@ -20,6 +20,7 @@ use databend_common_expression::DataField; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; +use super::SortDesc; use crate::executor::explain::PlanStatsInfo; use crate::executor::physical_plans::common::AggregateFunctionDesc; use crate::executor::PhysicalPlan; @@ -35,6 +36,8 @@ pub struct AggregatePartial { pub enable_experimental_aggregate_hashtable: bool, pub group_by_display: Vec, + // Order by keys if keys are subset of group by key, then we can use rank to filter data in previous + pub rank_limit: Option<(Vec, usize)>, // Only used for explain pub stat_info: Option, } diff --git a/src/query/sql/src/planner/binder/aggregate.rs b/src/query/sql/src/planner/binder/aggregate.rs index 2c5e7be098d4..71d715dcfa68 100644 --- a/src/query/sql/src/planner/binder/aggregate.rs +++ b/src/query/sql/src/planner/binder/aggregate.rs @@ -471,7 +471,8 @@ impl Binder { group_items: agg_info.group_items.clone(), aggregate_functions: agg_info.aggregate_functions.clone(), from_distinct: false, - limit: None, + rank_limit: None, + grouping_sets: agg_info.grouping_sets.as_ref().map(|g| GroupingSets { grouping_id_index: g.grouping_id_column.index, sets: g.sets.clone(), diff --git a/src/query/sql/src/planner/binder/bind_query/bind_value.rs b/src/query/sql/src/planner/binder/bind_query/bind_value.rs index 70b49d7de73f..08aee00a7cb5 100644 --- a/src/query/sql/src/planner/binder/bind_query/bind_value.rs +++ b/src/query/sql/src/planner/binder/bind_query/bind_value.rs @@ -346,10 +346,7 @@ impl Binder { Aggregate { mode: AggregateMode::Initial, group_items, - aggregate_functions: vec![], - from_distinct: false, - limit: None, - grouping_sets: None, + ..Default::default() } .into(), ), diff --git a/src/query/sql/src/planner/binder/binder.rs b/src/query/sql/src/planner/binder/binder.rs index b1c3cb070b9a..fc80aa40f37c 100644 --- a/src/query/sql/src/planner/binder/binder.rs +++ b/src/query/sql/src/planner/binder/binder.rs @@ -617,9 +617,9 @@ impl<'a> Binder { } } Statement::CallProcedure(stmt) => { if self.ctx.get_settings().get_enable_experimental_procedure()? { - self.bind_call_procedure(stmt).await? + self.bind_call_procedure(bind_context, stmt).await? 
} else { - return Err(ErrorCode::SyntaxException("DESC PROCEDURE, set enable_experimental_procedure=1")); + return Err(ErrorCode::SyntaxException("CALL PROCEDURE, set enable_experimental_procedure=1")); } } }; diff --git a/src/query/sql/src/planner/binder/ddl/procedure.rs b/src/query/sql/src/planner/binder/ddl/procedure.rs index fc5c5be5d4c6..69e32716ae04 100644 --- a/src/query/sql/src/planner/binder/ddl/procedure.rs +++ b/src/query/sql/src/planner/binder/ddl/procedure.rs @@ -30,14 +30,18 @@ use databend_common_meta_app::principal::ProcedureNameIdent; use databend_common_users::UserApiProvider; use crate::binder::show::get_show_options; +use crate::plans::CallProcedurePlan; use crate::plans::CreateProcedurePlan; use crate::plans::DropProcedurePlan; use crate::plans::ExecuteImmediatePlan; use crate::plans::Plan; use crate::plans::RewriteKind; +use crate::plans::SubqueryType; use crate::resolve_type_name; use crate::BindContext; use crate::Binder; +use crate::ScalarExpr; +use crate::TypeChecker; impl Binder { #[async_backtrace::framed] @@ -56,7 +60,7 @@ impl Binder { create_option, name, language, - args: _args, + args, return_type, comment, script, @@ -67,7 +71,7 @@ impl Binder { // 1. need parser name: ProcedureNameIdent = name + args // 2. need check script's return type and stmt.return_type - let meta = self.procedure_meta(return_type, script, comment, language)?; + let meta = self.procedure_meta(return_type, script, comment, language, args)?; Ok(Plan::CreateProcedure(Box::new(CreateProcedurePlan { create_option: create_option.clone().into(), tenant: tenant.to_owned(), @@ -107,22 +111,55 @@ impl Binder { .await } - pub async fn bind_call_procedure(&mut self, stmt: &CallProcedureStmt) -> Result { - let CallProcedureStmt { name, args } = stmt; + pub async fn bind_call_procedure( + &mut self, + bind_context: &mut BindContext, + stmt: &CallProcedureStmt, + ) -> Result { + let CallProcedureStmt { + name, + args: arguments, + } = stmt; let tenant = self.ctx.get_tenant(); - // TODO: ProcedureNameIdent = name + args_type. Need to get type in here. 
+ let mut type_checker = TypeChecker::try_create( + bind_context, + self.ctx.clone(), + &self.name_resolution_ctx, + self.metadata.clone(), + &[], + true, + )?; + let mut arg_types = vec![]; + for argument in arguments { + let box (arg, mut arg_type) = type_checker.resolve(argument)?; + if let ScalarExpr::SubqueryExpr(subquery) = &arg { + if subquery.typ == SubqueryType::Scalar && !arg.data_type()?.is_nullable() { + arg_type = arg_type.wrap_nullable(); + } + } + arg_types.push(arg_type.to_string()); + } let req = GetProcedureReq { inner: ProcedureNameIdent::new( tenant.clone(), - ProcedureIdentity::new(name, args.join(",")), + ProcedureIdentity::new(name, arg_types.join(",")), ), }; + let procedure = UserApiProvider::instance() .get_procedure(&tenant, req) .await?; - Ok(Plan::ExecuteImmediate(Box::new(ExecuteImmediatePlan { - script: procedure.procedure_meta.script, - }))) + if arg_types.is_empty() { + Ok(Plan::ExecuteImmediate(Box::new(ExecuteImmediatePlan { + script: procedure.procedure_meta.script, + }))) + } else { + Ok(Plan::CallProcedure(Box::new(CallProcedurePlan { + script: procedure.procedure_meta.script, + arg_names: procedure.procedure_meta.arg_names, + args: arguments.clone(), + }))) + } } fn procedure_meta( @@ -131,7 +168,16 @@ impl Binder { script: &str, comment: &Option, language: &ProcedureLanguage, + args: &Option>, ) -> Result { + let mut arg_names = vec![]; + if let Some(args) = args { + for arg in args { + if let Some(name) = &arg.name { + arg_names.push(name.to_string()); + } + } + } let mut return_types = Vec::with_capacity(return_type.len()); for arg_type in return_type { return_types.push(DataType::from(&resolve_type_name( @@ -142,6 +188,7 @@ impl Binder { Ok(ProcedureMeta { return_types, + arg_names, created_on: Utc::now(), updated_on: Utc::now(), script: script.to_string(), diff --git a/src/query/sql/src/planner/binder/distinct.rs b/src/query/sql/src/planner/binder/distinct.rs index 82d829f63759..621bc1ea0ca6 100644 --- a/src/query/sql/src/planner/binder/distinct.rs +++ b/src/query/sql/src/planner/binder/distinct.rs @@ -83,10 +83,8 @@ impl Binder { let distinct_plan = Aggregate { mode: AggregateMode::Initial, group_items, - aggregate_functions: vec![], from_distinct: true, - limit: None, - grouping_sets: None, + ..Default::default() }; Ok(SExpr::create_unary( diff --git a/src/query/sql/src/planner/format/display_plan.rs b/src/query/sql/src/planner/format/display_plan.rs index 801cde7a8a55..4f180e6b2959 100644 --- a/src/query/sql/src/planner/format/display_plan.rs +++ b/src/query/sql/src/planner/format/display_plan.rs @@ -194,6 +194,7 @@ impl Plan { Plan::ExecuteImmediate(_) => Ok("ExecuteImmediate".to_string()), Plan::CreateProcedure(_) => Ok("CreateProcedure".to_string()), Plan::DropProcedure(_) => Ok("DropProcedure".to_string()), + Plan::CallProcedure(_) => Ok("CallProcedure".to_string()), // Plan::ShowCreateProcedure(_) => Ok("ShowCreateProcedure".to_string()), // Plan::RenameProcedure(_) => Ok("ProcedureDatabase".to_string()), diff --git a/src/query/sql/src/planner/optimizer/aggregate/normalize_aggregate.rs b/src/query/sql/src/planner/optimizer/aggregate/normalize_aggregate.rs index 96514c2e5e89..bad2a93f5dd6 100644 --- a/src/query/sql/src/planner/optimizer/aggregate/normalize_aggregate.rs +++ b/src/query/sql/src/planner/optimizer/aggregate/normalize_aggregate.rs @@ -119,7 +119,7 @@ impl RuleNormalizeAggregateOptimizer { group_items: aggregate.group_items, aggregate_functions: new_aggregate_functions, from_distinct: aggregate.from_distinct, - limit: 
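// Sketch of the overload-resolution scheme used by `bind_call_procedure` above: the
// procedure is looked up by its name plus the comma-joined names of the resolved
// argument types. The in-memory registry and exact key format are illustrative
// assumptions; the real lookup goes through ProcedureNameIdent and UserApiProvider.
use std::collections::HashMap;

fn procedure_key(name: &str, arg_types: &[&str]) -> String {
    format!("{}({})", name, arg_types.join(","))
}

fn main() {
    let mut registry: HashMap<String, &str> = HashMap::new();
    // Two overloads of the same name, distinguished by their argument types.
    registry.insert(procedure_key("add_two", &["Int32", "Int32"]), "RETURN $1 + $2;");
    registry.insert(procedure_key("add_two", &["Float64", "Float64"]), "RETURN $1 + $2;");

    // CALL add_two(1, 2): the binder resolves both arguments to Int32 and the
    // matching overload's script is fetched.
    assert!(registry.contains_key(&procedure_key("add_two", &["Int32", "Int32"])));
}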
aggregate.limit, + rank_limit: aggregate.rank_limit, grouping_sets: aggregate.grouping_sets, }; diff --git a/src/query/sql/src/planner/optimizer/decorrelate/flatten_plan.rs b/src/query/sql/src/planner/optimizer/decorrelate/flatten_plan.rs index 74bd88267650..ac7743020a0f 100644 --- a/src/query/sql/src/planner/optimizer/decorrelate/flatten_plan.rs +++ b/src/query/sql/src/planner/optimizer/decorrelate/flatten_plan.rs @@ -148,10 +148,7 @@ impl SubqueryRewriter { Aggregate { mode: AggregateMode::Initial, group_items, - aggregate_functions: vec![], - from_distinct: false, - limit: None, - grouping_sets: None, + ..Default::default() } .into(), ), @@ -623,7 +620,7 @@ impl SubqueryRewriter { group_items, aggregate_functions: agg_items, from_distinct: aggregate.from_distinct, - limit: aggregate.limit, + rank_limit: aggregate.rank_limit.clone(), grouping_sets: aggregate.grouping_sets.clone(), } .into(), diff --git a/src/query/sql/src/planner/optimizer/decorrelate/subquery_rewriter.rs b/src/query/sql/src/planner/optimizer/decorrelate/subquery_rewriter.rs index 90d1b16f50f3..26edfa3624f2 100644 --- a/src/query/sql/src/planner/optimizer/decorrelate/subquery_rewriter.rs +++ b/src/query/sql/src/planner/optimizer/decorrelate/subquery_rewriter.rs @@ -459,10 +459,7 @@ impl SubqueryRewriter { .into(), index: agg_func_index, }], - from_distinct: false, - mode: AggregateMode::Initial, - limit: None, - grouping_sets: None, + ..Default::default() }; let compare = FunctionCall { @@ -692,9 +689,7 @@ impl SubqueryRewriter { index: any_idx, }, ], - from_distinct: false, - limit: None, - grouping_sets: None, + ..Default::default() } .into(), ), diff --git a/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs b/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs index 262bf06900f2..94cf8dc7009d 100644 --- a/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs +++ b/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs @@ -127,8 +127,6 @@ fn create_count_aggregate(mode: AggregateMode) -> Aggregate { }), index: 0, }], - from_distinct: false, - limit: None, - grouping_sets: None, + ..Default::default() } } diff --git a/src/query/sql/src/planner/optimizer/optimizer.rs b/src/query/sql/src/planner/optimizer/optimizer.rs index 625ce0bd0dd2..b8bbca939a4f 100644 --- a/src/query/sql/src/planner/optimizer/optimizer.rs +++ b/src/query/sql/src/planner/optimizer/optimizer.rs @@ -377,7 +377,6 @@ pub async fn optimize_query(opt_ctx: &mut OptimizerContext, mut s_expr: SExpr) - // After join reorder, Convert some single join to inner join. s_expr = SingleToInnerOptimizer::new().run(&s_expr)?; - // Deduplicate join conditions. 
s_expr = DeduplicateJoinConditionOptimizer::new().run(&s_expr)?; diff --git a/src/query/sql/src/planner/optimizer/rule/factory.rs b/src/query/sql/src/planner/optimizer/rule/factory.rs index 378cc3b89f73..dfd6e5c5147e 100644 --- a/src/query/sql/src/planner/optimizer/rule/factory.rs +++ b/src/query/sql/src/planner/optimizer/rule/factory.rs @@ -23,9 +23,9 @@ use super::rewrite::RulePushDownFilterAggregate; use super::rewrite::RulePushDownFilterEvalScalar; use super::rewrite::RulePushDownFilterJoin; use super::rewrite::RulePushDownFilterWindow; -use super::rewrite::RulePushDownLimitAggregate; use super::rewrite::RulePushDownLimitEvalScalar; use super::rewrite::RulePushDownPrewhere; +use super::rewrite::RulePushDownRankLimitAggregate; use super::rewrite::RulePushDownSortEvalScalar; use super::rewrite::RuleTryApplyAggIndex; use crate::optimizer::rule::rewrite::RuleEliminateFilter; @@ -82,7 +82,9 @@ impl RuleFactory { RuleID::PushDownLimitWindow => { Ok(Box::new(RulePushDownLimitWindow::new(MAX_PUSH_DOWN_LIMIT))) } - RuleID::PushDownLimitAggregate => Ok(Box::new(RulePushDownLimitAggregate::new())), + RuleID::RulePushDownRankLimitAggregate => { + Ok(Box::new(RulePushDownRankLimitAggregate::new())) + } RuleID::PushDownFilterAggregate => Ok(Box::new(RulePushDownFilterAggregate::new())), RuleID::PushDownFilterWindow => Ok(Box::new(RulePushDownFilterWindow::new())), RuleID::EliminateFilter => Ok(Box::new(RuleEliminateFilter::new(metadata))), diff --git a/src/query/sql/src/planner/optimizer/rule/rewrite/mod.rs b/src/query/sql/src/planner/optimizer/rule/rewrite/mod.rs index 0feaeea4c6fb..5191e4f5f74d 100644 --- a/src/query/sql/src/planner/optimizer/rule/rewrite/mod.rs +++ b/src/query/sql/src/planner/optimizer/rule/rewrite/mod.rs @@ -65,7 +65,7 @@ pub use rule_push_down_filter_sort::RulePushDownFilterSort; pub use rule_push_down_filter_union::RulePushDownFilterUnion; pub use rule_push_down_filter_window::RulePushDownFilterWindow; pub use rule_push_down_limit::RulePushDownLimit; -pub use rule_push_down_limit_aggregate::RulePushDownLimitAggregate; +pub use rule_push_down_limit_aggregate::RulePushDownRankLimitAggregate; pub use rule_push_down_limit_expression::RulePushDownLimitEvalScalar; pub use rule_push_down_limit_join::RulePushDownLimitOuterJoin; pub use rule_push_down_limit_scan::RulePushDownLimitScan; diff --git a/src/query/sql/src/planner/optimizer/rule/rewrite/rule_push_down_limit_aggregate.rs b/src/query/sql/src/planner/optimizer/rule/rewrite/rule_push_down_limit_aggregate.rs index d73fc600f3de..12e74010eb41 100644 --- a/src/query/sql/src/planner/optimizer/rule/rewrite/rule_push_down_limit_aggregate.rs +++ b/src/query/sql/src/planner/optimizer/rule/rewrite/rule_push_down_limit_aggregate.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::cmp; use std::sync::Arc; use crate::optimizer::extract::Matcher; @@ -21,47 +20,75 @@ use crate::optimizer::rule::TransformResult; use crate::optimizer::RuleID; use crate::optimizer::SExpr; use crate::plans::Aggregate; +use crate::plans::AggregateMode; use crate::plans::Limit; +use crate::plans::Operator; use crate::plans::RelOp; use crate::plans::RelOperator; +use crate::plans::Sort; +use crate::plans::SortItem; -/// Input: Limit +/// Input: Limit | Sort /// \ /// Aggregate /// \ /// * /// -/// Output: Limit +/// Output: Limit | Sort /// \ -/// Aggregate(padding limit) +/// Aggregate(padding limit | rank_limit) /// \ /// * -pub struct RulePushDownLimitAggregate { +pub struct RulePushDownRankLimitAggregate { id: RuleID, matchers: Vec, } -impl RulePushDownLimitAggregate { +impl RulePushDownRankLimitAggregate { pub fn new() -> Self { Self { - id: RuleID::PushDownLimitAggregate, - matchers: vec![Matcher::MatchOp { - op_type: RelOp::Limit, - children: vec![Matcher::MatchOp { - op_type: RelOp::Aggregate, - children: vec![Matcher::Leaf], - }], - }], + id: RuleID::RulePushDownRankLimitAggregate, + matchers: vec![ + Matcher::MatchOp { + op_type: RelOp::Limit, + children: vec![Matcher::MatchOp { + op_type: RelOp::Aggregate, + children: vec![Matcher::MatchOp { + op_type: RelOp::Aggregate, + children: vec![Matcher::Leaf], + }], + }], + }, + Matcher::MatchOp { + op_type: RelOp::Sort, + children: vec![Matcher::MatchOp { + op_type: RelOp::Aggregate, + children: vec![Matcher::MatchOp { + op_type: RelOp::Aggregate, + children: vec![Matcher::Leaf], + }], + }], + }, + Matcher::MatchOp { + op_type: RelOp::Sort, + children: vec![Matcher::MatchOp { + op_type: RelOp::EvalScalar, + children: vec![Matcher::MatchOp { + op_type: RelOp::Aggregate, + children: vec![Matcher::MatchOp { + op_type: RelOp::Aggregate, + children: vec![Matcher::Leaf], + }], + }], + }], + }, + ], } } -} - -impl Rule for RulePushDownLimitAggregate { - fn id(&self) -> RuleID { - self.id - } - fn apply( + // There is no order by, so we don't care the order of result. 
+ // To make query works with consistent result and more efficient, we will inject a order by before limit + fn apply_limit( &self, s_expr: &SExpr, state: &mut TransformResult, @@ -69,21 +96,133 @@ impl Rule for RulePushDownLimitAggregate { let limit: Limit = s_expr.plan().clone().try_into()?; if let Some(mut count) = limit.limit { count += limit.offset; - let agg = s_expr.child(0)?; - let mut agg_limit: Aggregate = agg.plan().clone().try_into()?; + let agg_final = s_expr.child(0)?; + let agg_partial = agg_final.child(0)?; + + let mut agg_limit: Aggregate = agg_partial.plan().clone().try_into()?; + + let sort_items = agg_limit + .group_items + .iter() + .map(|g| SortItem { + index: g.index, + asc: true, + nulls_first: false, + }) + .collect::>(); + agg_limit.rank_limit = Some((sort_items.clone(), count)); + + let sort = Sort { + items: sort_items.clone(), + limit: Some(count), + after_exchange: None, + pre_projection: None, + window_partition: vec![], + }; + + let agg_partial = SExpr::create_unary( + Arc::new(RelOperator::Aggregate(agg_limit)), + Arc::new(agg_partial.child(0)?.clone()), + ); + let agg_final = agg_final.replace_children(vec![agg_partial.into()]); + let sort = SExpr::create_unary(Arc::new(RelOperator::Sort(sort)), agg_final.into()); + let mut result = s_expr.replace_children(vec![Arc::new(sort)]); + + result.set_applied_rule(&self.id); + state.add_result(result); + } + Ok(()) + } + + fn apply_sort( + &self, + s_expr: &SExpr, + state: &mut TransformResult, + ) -> databend_common_exception::Result<()> { + let sort: Sort = s_expr.plan().clone().try_into()?; + let mut has_eval_scalar = false; + let agg = match s_expr.child(0)?.plan().rel_op() { + RelOp::Aggregate => s_expr.child(0)?, + RelOp::EvalScalar => { + has_eval_scalar = true; + s_expr.child(0)?.child(0)? 
+ } + _ => return Ok(()), + }; + + let agg_limit_expr = agg.child(0)?; + let mut agg_limit: Aggregate = agg_limit_expr.plan().clone().try_into()?; + + if agg_limit.mode != AggregateMode::Partial { + return Ok(()); + } + + if let Some(limit) = sort.limit { + let is_order_subset = sort + .items + .iter() + .all(|k| agg_limit.group_items.iter().any(|g| g.index == k.index)); - agg_limit.limit = Some(agg_limit.limit.map_or(count, |c| cmp::max(c, count))); - let agg = SExpr::create_unary( + if !is_order_subset { + return Ok(()); + } + let mut sort_items = Vec::with_capacity(agg_limit.group_items.len()); + let mut not_found_sort_items = vec![]; + for i in 0..agg_limit.group_items.len() { + let group_item = &agg_limit.group_items[i]; + if let Some(sort_item) = sort.items.iter().find(|k| k.index == group_item.index) { + sort_items.push(SortItem { + index: group_item.index, + asc: sort_item.asc, + nulls_first: sort_item.nulls_first, + }); + } else { + not_found_sort_items.push(SortItem { + index: group_item.index, + asc: true, + nulls_first: false, + }); + } + } + sort_items.extend(not_found_sort_items); + + agg_limit.rank_limit = Some((sort_items, limit)); + + let agg_partial = SExpr::create_unary( Arc::new(RelOperator::Aggregate(agg_limit)), - Arc::new(agg.child(0)?.clone()), + Arc::new(agg_limit_expr.child(0)?.clone()), ); - let mut result = s_expr.replace_children(vec![Arc::new(agg)]); + let agg = agg.replace_children(vec![Arc::new(agg_partial)]); + let mut result = if has_eval_scalar { + let eval_scalar = s_expr.child(0)?.replace_children(vec![Arc::new(agg)]); + s_expr.replace_children(vec![Arc::new(eval_scalar)]) + } else { + s_expr.replace_children(vec![Arc::new(agg)]) + }; result.set_applied_rule(&self.id); state.add_result(result); } Ok(()) } +} + +impl Rule for RulePushDownRankLimitAggregate { + fn id(&self) -> RuleID { + self.id + } + + fn apply( + &self, + s_expr: &SExpr, + state: &mut TransformResult, + ) -> databend_common_exception::Result<()> { + match s_expr.plan().rel_op() { + RelOp::Limit => self.apply_limit(s_expr, state), + RelOp::Sort | RelOp::EvalScalar => self.apply_sort(s_expr, state), + _ => Ok(()), + } + } fn matchers(&self) -> &[Matcher] { &self.matchers diff --git a/src/query/sql/src/planner/optimizer/rule/rule.rs b/src/query/sql/src/planner/optimizer/rule/rule.rs index 95355c9f4038..d99cf67fc252 100644 --- a/src/query/sql/src/planner/optimizer/rule/rule.rs +++ b/src/query/sql/src/planner/optimizer/rule/rule.rs @@ -47,7 +47,7 @@ pub static DEFAULT_REWRITE_RULES: LazyLock> = LazyLock::new(|| { RuleID::PushDownLimitEvalScalar, RuleID::PushDownLimitSort, RuleID::PushDownLimitWindow, - RuleID::PushDownLimitAggregate, + RuleID::RulePushDownRankLimitAggregate, RuleID::PushDownLimitOuterJoin, RuleID::PushDownLimitScan, RuleID::SemiToInnerJoin, @@ -95,7 +95,7 @@ pub enum RuleID { PushDownLimitEvalScalar, PushDownLimitSort, PushDownLimitWindow, - PushDownLimitAggregate, + RulePushDownRankLimitAggregate, PushDownLimitScan, PushDownSortEvalScalar, PushDownSortScan, @@ -131,7 +131,7 @@ impl Display for RuleID { RuleID::PushDownLimitOuterJoin => write!(f, "PushDownLimitOuterJoin"), RuleID::PushDownLimitEvalScalar => write!(f, "PushDownLimitEvalScalar"), RuleID::PushDownLimitSort => write!(f, "PushDownLimitSort"), - RuleID::PushDownLimitAggregate => write!(f, "PushDownLimitAggregate"), + RuleID::RulePushDownRankLimitAggregate => write!(f, "RulePushDownRankLimitAggregate"), RuleID::PushDownFilterAggregate => write!(f, "PushDownFilterAggregate"), RuleID::PushDownLimitScan => write!(f, 
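// Sketch of the preconditions `RulePushDownRankLimitAggregate::apply_sort` checks above
// before attaching a rank limit to the partial aggregate: the matched aggregate must be
// the partial stage, the Sort must carry a limit, and every sort key must also be a
// group-by key. Plain usize column indexes stand in for the planner's column references;
// the real rule additionally appends group-by columns absent from the ORDER BY with
// default ordering.
fn can_push_rank_limit(sort_keys: &[usize], sort_limit: Option<usize>, group_keys: &[usize]) -> bool {
    sort_limit.is_some() && sort_keys.iter().all(|k| group_keys.contains(k))
}

fn main() {
    // SELECT a, b, count(*) FROM t GROUP BY a, b ORDER BY a LIMIT 10 -> pushable
    assert!(can_push_rank_limit(&[0], Some(10), &[0, 1]));
    // ORDER BY a non-group column (e.g. the aggregate output) -> not pushable
    assert!(!can_push_rank_limit(&[2], Some(10), &[0, 1]));
    // No LIMIT attached to the Sort -> not pushable
    assert!(!can_push_rank_limit(&[0], None, &[0, 1]));
}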
"PushDownLimitScan"), RuleID::PushDownSortScan => write!(f, "PushDownSortScan"), diff --git a/src/query/sql/src/planner/plans/aggregate.rs b/src/query/sql/src/planner/plans/aggregate.rs index cdec39872124..89b623aa19f3 100644 --- a/src/query/sql/src/planner/plans/aggregate.rs +++ b/src/query/sql/src/planner/plans/aggregate.rs @@ -27,6 +27,7 @@ use crate::optimizer::RelationalProperty; use crate::optimizer::RequiredProperty; use crate::optimizer::StatInfo; use crate::optimizer::Statistics; +use crate::plans::sort::SortItem; use crate::plans::Operator; use crate::plans::RelOp; use crate::plans::ScalarItem; @@ -63,10 +64,24 @@ pub struct Aggregate { pub aggregate_functions: Vec, // True if the plan is generated from distinct, else the plan is a normal aggregate; pub from_distinct: bool, - pub limit: Option, + pub rank_limit: Option<(Vec, usize)>, + pub grouping_sets: Option, } +impl Default for Aggregate { + fn default() -> Self { + Self { + mode: AggregateMode::Initial, + group_items: vec![], + aggregate_functions: vec![], + from_distinct: false, + rank_limit: None, + grouping_sets: None, + } + } +} + impl Aggregate { pub fn used_columns(&self) -> Result { let mut used_columns = ColumnSet::new(); diff --git a/src/query/sql/src/planner/plans/ddl/procedure.rs b/src/query/sql/src/planner/plans/ddl/procedure.rs index b8ab6e4de2d3..ca38ad6e3d94 100644 --- a/src/query/sql/src/planner/plans/ddl/procedure.rs +++ b/src/query/sql/src/planner/plans/ddl/procedure.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_ast::ast::Expr; use databend_common_expression::types::DataType; use databend_common_expression::DataField; use databend_common_expression::DataSchemaRef; @@ -86,3 +87,17 @@ impl From<&DropProcedurePlan> for DropProcedureReq { } } } + +#[derive(Clone, Debug, PartialEq)] +pub struct CallProcedurePlan { + pub script: String, + pub arg_names: Vec, + pub args: Vec, +} + +impl CallProcedurePlan { + pub fn schema(&self) -> DataSchemaRef { + // TODO: schema generated by plan.arg_names + DataSchemaRefExt::create(vec![DataField::new("Result", DataType::String)]) + } +} diff --git a/src/query/sql/src/planner/plans/plan.rs b/src/query/sql/src/planner/plans/plan.rs index 8655d7f66d62..f3b25ff99d15 100644 --- a/src/query/sql/src/planner/plans/plan.rs +++ b/src/query/sql/src/planner/plans/plan.rs @@ -41,6 +41,7 @@ use crate::plans::AlterUserPlan; use crate::plans::AlterViewPlan; use crate::plans::AlterVirtualColumnPlan; use crate::plans::AnalyzeTablePlan; +use crate::plans::CallProcedurePlan; use crate::plans::CopyIntoTableMode; use crate::plans::CopyIntoTablePlan; use crate::plans::CreateCatalogPlan; @@ -355,6 +356,7 @@ pub enum Plan { // ShowCreateProcedure(Box), DropProcedure(Box), CreateProcedure(Box), + CallProcedure(Box), // RenameProcedure(Box), // sequence @@ -478,6 +480,7 @@ impl Plan { Plan::DescConnection(plan) => plan.schema(), Plan::ShowConnections(plan) => plan.schema(), Plan::ExecuteImmediate(plan) => plan.schema(), + Plan::CallProcedure(plan) => plan.schema(), Plan::InsertMultiTable(plan) => plan.schema(), _ => Arc::new(DataSchema::empty()), diff --git a/src/query/sql/src/planner/semantic/type_check.rs b/src/query/sql/src/planner/semantic/type_check.rs index 0f5bedf86789..057048e69516 100644 --- a/src/query/sql/src/planner/semantic/type_check.rs +++ b/src/query/sql/src/planner/semantic/type_check.rs @@ -4337,10 +4337,7 @@ impl<'a> TypeChecker<'a> { }), index: self.metadata.read().columns().len() - 
1, }], - aggregate_functions: vec![], - from_distinct: false, - limit: None, - grouping_sets: None, + ..Default::default() } .into(), ), diff --git a/src/query/storages/common/index/Cargo.toml b/src/query/storages/common/index/Cargo.toml index 1064768e2cb6..0ca11e3627c3 100644 --- a/src/query/storages/common/index/Cargo.toml +++ b/src/query/storages/common/index/Cargo.toml @@ -25,13 +25,15 @@ databend-common-functions = { workspace = true } databend-storages-common-table-meta = { workspace = true } fastrace = { workspace = true } jsonb = { workspace = true } +levenshtein_automata = "0.2.1" log = { workspace = true } match-template = { workspace = true } parquet = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tantivy = { workspace = true } -tantivy-common = "0.7.0" +tantivy-common = { git = "https://github.com/b41sh/tantivy", rev = "37aeac0", package = "tantivy-common" } +tantivy-fst = "0.5" thiserror = { workspace = true } xorfilter-rs = { workspace = true, features = ["cbordata"] } diff --git a/src/query/storages/common/index/src/inverted_index.rs b/src/query/storages/common/index/src/inverted_index.rs index e1015cf0685d..6d8db4c5b424 100644 --- a/src/query/storages/common/index/src/inverted_index.rs +++ b/src/query/storages/common/index/src/inverted_index.rs @@ -34,6 +34,9 @@ // IN THE SOFTWARE. use std::collections::BTreeMap; +use std::collections::HashMap; +use std::collections::HashSet; +use std::collections::VecDeque; use std::io; use std::io::BufWriter; use std::io::Cursor; @@ -48,9 +51,15 @@ use std::sync::Arc; use crc32fast::Hasher; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::Scalar; +use databend_common_expression::TableDataType; +use databend_common_expression::TableField; use databend_storages_common_table_meta::meta::testify_version; use databend_storages_common_table_meta::meta::SingleColumnMeta; use databend_storages_common_table_meta::meta::Versioned; +use levenshtein_automata::Distance; +use levenshtein_automata::LevenshteinAutomatonBuilder; +use levenshtein_automata::DFA; use log::warn; use tantivy::directory::error::DeleteError; use tantivy::directory::error::OpenReadError; @@ -63,9 +72,23 @@ use tantivy::directory::TerminatingWrite; use tantivy::directory::WatchCallback; use tantivy::directory::WatchHandle; use tantivy::directory::WritePtr; +use tantivy::positions::PositionReader; +use tantivy::postings::TermInfo; +use tantivy::query::BooleanQuery; +use tantivy::query::FuzzyTermQuery; +use tantivy::query::Occur; +use tantivy::query::PhraseQuery; +use tantivy::query::Query; +use tantivy::query::QueryClone; +use tantivy::query::TermQuery; use tantivy::Directory; +use tantivy::Term; use tantivy_common::BinarySerializable; +use tantivy_common::HasLen; use tantivy_common::VInt; +use tantivy_fst::Automaton; +use tantivy_fst::IntoStreamer; +use tantivy_fst::Streamer; // tantivy version is used to generate the footer data @@ -119,6 +142,104 @@ impl Footer { } } +fn extract_footer(data: FileSlice) -> Result<(Vec, Vec)> { + // The following code is copied from tantivy `CompositeFile::open` function. + // extract field number and offsets of each fields. + let end = data.len(); + let footer_len_data = data.slice_from(end - 4).read_bytes()?; + let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? 
as usize; + let footer_start = end - 4 - footer_len; + let footer_data = data + .slice(footer_start..footer_start + footer_len) + .read_bytes()?; + let mut footer_buffer = footer_data.as_slice(); + let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize; + + let mut offset = 0; + let mut offsets = Vec::with_capacity(num_fields); + let mut file_addrs = Vec::with_capacity(num_fields); + for _ in 0..num_fields { + offset += VInt::deserialize(&mut footer_buffer)?.0 as usize; + offsets.push(offset); + let file_addr = FileAddr::deserialize(&mut footer_buffer)?; + file_addrs.push(file_addr); + } + offsets.push(footer_start); + + Ok((file_addrs, offsets)) +} + +// Extract fsts and term dicts into separate columns. +pub fn extract_fsts( + data: FileSlice, + fields: &mut Vec, + values: &mut Vec, +) -> Result<()> { + let (file_addrs, offsets) = extract_footer(data.clone())?; + + let mut term_dict_fields = Vec::with_capacity(file_addrs.len()); + let mut term_dict_values = Vec::with_capacity(file_addrs.len()); + for (i, file_addr) in file_addrs.iter().enumerate() { + let field_id = file_addr.field.field_id(); + let start_offset = offsets[i]; + let end_offset = offsets[i + 1]; + + let field_slice = data.slice(start_offset..end_offset); + + let (main_slice, footer_len_slice) = field_slice.split_from_end(16); + let mut footer_len_bytes = footer_len_slice.read_bytes()?; + let footer_size = u64::deserialize(&mut footer_len_bytes)?; + + let (fst_file_slice, term_dict_file_slice) = + main_slice.split_from_end(footer_size as usize); + + let fst_field_name = format!("fst-{}", field_id); + let fst_field = TableField::new(&fst_field_name, TableDataType::Binary); + fields.push(fst_field); + + let fst_bytes = fst_file_slice.read_bytes()?; + values.push(Scalar::Binary(fst_bytes.as_slice().to_vec())); + + let term_dict_field_name = format!("term-{}", field_id); + let term_dict_field = TableField::new(&term_dict_field_name, TableDataType::Binary); + term_dict_fields.push(term_dict_field); + + let term_dict_bytes = term_dict_file_slice.read_bytes()?; + term_dict_values.push(Scalar::Binary(term_dict_bytes.as_slice().to_vec())); + } + + fields.append(&mut term_dict_fields); + values.append(&mut term_dict_values); + + Ok(()) +} + +// Extract component file into separate columns by fields. +pub fn extract_component_fields( + name: &str, + data: FileSlice, + fields: &mut Vec, + values: &mut Vec, +) -> Result<()> { + let (file_addrs, offsets) = extract_footer(data.clone())?; + + for (i, file_addr) in file_addrs.iter().enumerate() { + let field_id = file_addr.field.field_id(); + let start_offset = offsets[i]; + let end_offset = offsets[i + 1]; + + let field_name = format!("{}-{}", name, field_id); + let field = TableField::new(&field_name, TableDataType::Binary); + fields.push(field); + + let field_slice = data.slice(start_offset..end_offset); + let field_bytes = field_slice.read_bytes()?; + values.push(Scalar::Binary(field_bytes.as_slice().to_vec())); + } + + Ok(()) +} + // Build footer for tantivy files. // Footer is used to check whether the data is valid when open a file. 
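// `extract_fsts` and `extract_component_fields` above split a tantivy composite file into
// per-field columns. This sketch only illustrates the resulting column-name convention
// that the reader relies on later ("fst-{field_id}", "term-{field_id}", and
// "{component}-{field_id}" for the idx/pos/fieldnorm data); the field ids are made up.
fn component_column_name(component: &str, field_id: u32) -> String {
    format!("{}-{}", component, field_id)
}

fn main() {
    assert_eq!(component_column_name("fst", 0), "fst-0");
    assert_eq!(component_column_name("term", 0), "term-0");
    // Postings and positions for field 1 end up in "idx-1" and "pos-1".
    assert_eq!(component_column_name("idx", 1), "idx-1");
    assert_eq!(component_column_name("pos", 1), "pos-1");
}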
pub fn build_tantivy_footer(bytes: &[u8]) -> Result> { @@ -216,6 +337,382 @@ fn build_empty_position_data(field_nums: usize) -> Result { Ok(OwnedBytes::new(buf)) } +struct DfaWrapper(pub DFA); + +impl Automaton for DfaWrapper { + type State = u32; + + fn start(&self) -> Self::State { + self.0.initial_state() + } + + fn is_match(&self, state: &Self::State) -> bool { + match self.0.distance(*state) { + Distance::Exact(_) => true, + Distance::AtLeast(_) => false, + } + } + + fn can_match(&self, state: &u32) -> bool { + *state != levenshtein_automata::SINK_STATE + } + + fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + self.0.transition(*state, byte) + } +} + +// Term value contains values associated with a Term +// used to match query and collect matched doc ids. +#[derive(Clone)] +pub struct TermValue { + // term info + pub term_info: TermInfo, + // term matched doc ids + pub doc_ids: Vec, + // term frequencies for each doc + pub term_freqs: Vec, + // position reader is used to read positions in doc for phrase query + pub position_reader: Option, +} + +// Check if fst contains terms in query. +// If not, we can skip read other parts of inverted index. +pub fn check_term_fsts_match( + query: Box, + fst_maps: &HashMap>, + fuzziness: &Option, + matched_terms: &mut HashMap, + fuzziness_terms: &mut HashMap>, +) -> bool { + if let Some(term_query) = query.downcast_ref::() { + let term = term_query.term(); + let field = term.field(); + let field_id = field.field_id() as usize; + if let Some(fst_map) = fst_maps.get(&field_id) { + if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { + matched_terms.insert(term.clone(), idx); + return true; + } + } + false + } else if let Some(bool_query) = query.downcast_ref::() { + let mut matched_num = 0; + for (occur, sub_query) in bool_query.clauses() { + let matched = check_term_fsts_match( + sub_query.box_clone(), + fst_maps, + fuzziness, + matched_terms, + fuzziness_terms, + ); + if matched { + matched_num += 1; + } + match occur { + Occur::Should => {} + Occur::Must => { + if !matched { + return false; + } + } + Occur::MustNot => {} + } + } + matched_num > 0 + } else if let Some(phrase_query) = query.downcast_ref::() { + // PhraseQuery must match all terms. + let field = phrase_query.field(); + let field_id = field.field_id() as usize; + if let Some(fst_map) = fst_maps.get(&field_id) { + let mut matched_all = true; + for term in phrase_query.phrase_terms() { + let matched = if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { + matched_terms.insert(term.clone(), idx); + true + } else { + false + }; + if !matched { + matched_all = false; + break; + } + } + matched_all + } else { + false + } + } else if let Some(fuzzy_term_query) = query.downcast_ref::() { + // FuzzyTermQuery match terms by levenshtein distance. 
+ let fuzziness = fuzziness.unwrap(); + + let term = fuzzy_term_query.term(); + let field = term.field(); + let field_id = field.field_id() as usize; + if let Some(fst_map) = fst_maps.get(&field_id) { + // build levenshtein automaton + let lev_automaton_builder = LevenshteinAutomatonBuilder::new(fuzziness, true); + let term_str = String::from_utf8_lossy(term.serialized_value_bytes()); + let automaton = DfaWrapper(lev_automaton_builder.build_dfa(&term_str)); + + let mut fuzz_term_values = vec![]; + let mut stream = fst_map.search(automaton).into_stream(); + while let Some((key, idx)) = stream.next() { + let key_str = unsafe { std::str::from_utf8_unchecked(key) }; + let fuzz_term = Term::from_field_text(field, key_str); + matched_terms.insert(fuzz_term.clone(), idx); + fuzz_term_values.push(fuzz_term); + } + let matched = !fuzz_term_values.is_empty(); + fuzziness_terms.insert(term.clone(), fuzz_term_values); + matched + } else { + false + } + } else { + // TODO: handle other Query types + let mut matched = false; + query.query_terms(&mut |term, _| { + let field = term.field(); + let field_id = field.field_id() as usize; + if let Some(fst_map) = fst_maps.get(&field_id) { + if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { + matched_terms.insert(term.clone(), idx); + matched = true; + } + } + }); + + matched + } +} + +// collect matched rows by term value +pub fn collect_matched_rows( + query: Box, + row_count: u32, + fuzziness_terms: &HashMap>, + term_values: &mut HashMap, +) -> Vec { + if let Some(term_query) = query.downcast_ref::() { + let term = term_query.term(); + if let Some(term_value) = term_values.get(term) { + term_value.doc_ids.clone() + } else { + vec![] + } + } else if let Some(bool_query) = query.downcast_ref::() { + let mut should_doc_ids_opt = None; + let mut must_doc_ids_opt = None; + let mut must_not_doc_ids_opt = None; + for (occur, sub_query) in bool_query.clauses() { + let doc_ids = collect_matched_rows( + sub_query.box_clone(), + row_count, + fuzziness_terms, + term_values, + ); + let doc_id_set = HashSet::from_iter(doc_ids.into_iter()); + match occur { + Occur::Should => { + if should_doc_ids_opt.is_none() { + should_doc_ids_opt = Some(doc_id_set); + } else { + let should_doc_ids = should_doc_ids_opt.unwrap(); + should_doc_ids_opt = + Some(should_doc_ids.union(&doc_id_set).copied().collect()) + } + } + Occur::Must => { + if must_doc_ids_opt.is_none() { + must_doc_ids_opt = Some(doc_id_set); + } else { + let must_doc_ids = must_doc_ids_opt.unwrap(); + must_doc_ids_opt = + Some(must_doc_ids.intersection(&doc_id_set).copied().collect()) + } + } + Occur::MustNot => { + if must_not_doc_ids_opt.is_none() { + must_not_doc_ids_opt = Some(doc_id_set); + } else { + let must_not_doc_ids = must_not_doc_ids_opt.unwrap(); + must_not_doc_ids_opt = + Some(must_not_doc_ids.union(&doc_id_set).copied().collect()) + } + } + } + } + + let doc_ids = if let Some(mut should_doc_ids) = should_doc_ids_opt { + if let Some(must_doc_ids) = must_doc_ids_opt { + should_doc_ids = should_doc_ids + .intersection(&must_doc_ids) + .copied() + .collect() + } + if let Some(must_not_doc_ids) = must_not_doc_ids_opt { + should_doc_ids = should_doc_ids + .difference(&must_not_doc_ids) + .copied() + .collect() + } + should_doc_ids + } else if let Some(mut must_doc_ids) = must_doc_ids_opt { + if let Some(must_not_doc_ids) = must_not_doc_ids_opt { + must_doc_ids = must_doc_ids + .difference(&must_not_doc_ids) + .copied() + .collect() + } + must_doc_ids + } else if let Some(must_not_doc_ids) = 
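// Simplified model of how the BooleanQuery branch of `collect_matched_rows` combines
// per-clause doc-id sets: Should clauses are unioned, Must clauses intersected, and
// MustNot clauses unioned then subtracted (falling back to "all rows" when only MustNot
// clauses exist). Doc ids are plain u32s; the Occur enum only mirrors tantivy's.
use std::collections::HashSet;

enum Occur { Should, Must, MustNot }

fn combine(clauses: Vec<(Occur, HashSet<u32>)>, row_count: u32) -> HashSet<u32> {
    let mut should: Option<HashSet<u32>> = None;
    let mut must: Option<HashSet<u32>> = None;
    let mut must_not: Option<HashSet<u32>> = None;
    for (occur, ids) in clauses {
        match occur {
            // Several Should clauses accumulate by union.
            Occur::Should => {
                should = Some(match should {
                    Some(acc) => acc.union(&ids).copied().collect(),
                    None => ids,
                })
            }
            // Several Must clauses accumulate by intersection.
            Occur::Must => {
                must = Some(match must {
                    Some(acc) => acc.intersection(&ids).copied().collect(),
                    None => ids,
                })
            }
            // Several MustNot clauses accumulate by union before being subtracted.
            Occur::MustNot => {
                must_not = Some(match must_not {
                    Some(acc) => acc.union(&ids).copied().collect(),
                    None => ids,
                })
            }
        }
    }
    match (should, must, must_not) {
        (Some(s), m, not) => {
            let mut r = s;
            if let Some(m) = m {
                r = r.intersection(&m).copied().collect();
            }
            if let Some(not) = not {
                r = r.difference(&not).copied().collect();
            }
            r
        }
        (None, Some(m), not) => {
            let mut r = m;
            if let Some(not) = not {
                r = r.difference(&not).copied().collect();
            }
            r
        }
        (None, None, Some(not)) => (0..row_count).filter(|id| !not.contains(id)).collect(),
        (None, None, None) => HashSet::new(),
    }
}

fn main() {
    let a: HashSet<u32> = [1, 2, 3].into_iter().collect();
    let b: HashSet<u32> = [2, 3, 4].into_iter().collect();
    let expected: HashSet<u32> = [1].into_iter().collect();
    assert_eq!(combine(vec![(Occur::Must, a), (Occur::MustNot, b)], 10), expected);
}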
must_not_doc_ids_opt { + let all_doc_ids = HashSet::from_iter(0..row_count); + let doc_ids = all_doc_ids.difference(&must_not_doc_ids).copied().collect(); + doc_ids + } else { + HashSet::new() + }; + + let mut doc_ids = Vec::from_iter(doc_ids); + doc_ids.sort(); + doc_ids + } else if let Some(phrase_query) = query.downcast_ref::() { + let mut union_doc_ids = HashSet::new(); + let mut intersection_doc_ids_opt = None; + + for term in phrase_query.phrase_terms() { + if let Some(term_value) = term_values.get(&term) { + let doc_id_set = HashSet::from_iter(term_value.doc_ids.clone()); + union_doc_ids = union_doc_ids.union(&doc_id_set).copied().collect(); + if intersection_doc_ids_opt.is_none() { + intersection_doc_ids_opt = Some(doc_id_set); + } else { + let intersection_doc_ids = intersection_doc_ids_opt.unwrap(); + intersection_doc_ids_opt = Some( + intersection_doc_ids + .intersection(&doc_id_set) + .copied() + .collect(), + ); + } + } + } + + let intersection_doc_ids = intersection_doc_ids_opt.unwrap_or_default(); + if intersection_doc_ids.is_empty() { + return vec![]; + } + let mut union_doc_ids = Vec::from_iter(union_doc_ids); + union_doc_ids.sort(); + + // check each docs + let mut matched_doc_ids = vec![]; + for doc_id in union_doc_ids { + if !intersection_doc_ids.contains(&doc_id) { + continue; + } + + let mut term_pos_map = HashMap::new(); + for term in phrase_query.phrase_terms() { + let mut offset = 0; + let mut term_freq = 0; + if let Some(term_value) = term_values.get_mut(&term) { + for i in 0..term_value.doc_ids.len() { + if term_value.doc_ids[i] < doc_id { + offset += term_value.term_freqs[i] as u64; + } else { + term_freq = term_value.term_freqs[i] as usize; + break; + } + } + // collect positions in the docs + if let Some(position_reader) = term_value.position_reader.as_mut() { + let mut pos_output = vec![0; term_freq]; + position_reader.read(offset, &mut pos_output[..]); + for i in 1..pos_output.len() { + pos_output[i] += pos_output[i - 1]; + } + let positions = VecDeque::from_iter(pos_output); + term_pos_map.insert(term.clone(), positions); + } + } + } + + let mut is_first = true; + let mut distance = 0; + let mut matched = true; + let mut last_position = 0; + for (query_position, term) in phrase_query.phrase_terms_with_offsets() { + if let Some(positions) = term_pos_map.get_mut(&term) { + let mut find_position = false; + while let Some(doc_position) = positions.pop_front() { + // skip previous positions. + if doc_position < last_position { + continue; + } + last_position = doc_position; + let doc_distance = doc_position - (query_position as u32); + if is_first { + is_first = false; + distance = doc_distance; + } else { + // distance must same as first term. + if doc_distance != distance { + matched = false; + } + } + find_position = true; + break; + } + if !find_position { + matched = false; + } + } else { + matched = false; + } + if !matched { + break; + } + } + if matched { + matched_doc_ids.push(doc_id); + } + } + matched_doc_ids + } else if let Some(fuzzy_term_query) = query.downcast_ref::() { + let mut fuzz_doc_ids = HashSet::new(); + let term = fuzzy_term_query.term(); + + // collect related terms of the original term. 
+ if let Some(related_terms) = fuzziness_terms.get(term) { + for term in related_terms { + if let Some(term_value) = term_values.get(term) { + let doc_id_set: HashSet = HashSet::from_iter(term_value.doc_ids.clone()); + fuzz_doc_ids = fuzz_doc_ids.union(&doc_id_set).copied().collect(); + } + } + let mut doc_ids = Vec::from_iter(fuzz_doc_ids); + doc_ids.sort(); + doc_ids + } else { + vec![] + } + } else { + let mut union_doc_ids = HashSet::new(); + query.query_terms(&mut |term, _| { + if let Some(term_value) = term_values.get(term) { + let doc_id_set: HashSet = HashSet::from_iter(term_value.doc_ids.clone()); + union_doc_ids = union_doc_ids.union(&doc_id_set).copied().collect(); + } + }); + + let mut doc_ids = Vec::from_iter(union_doc_ids); + doc_ids.sort(); + doc_ids + } +} + #[derive(Clone)] pub struct InvertedIndexMeta { pub columns: Vec<(String, SingleColumnMeta)>, diff --git a/src/query/storages/common/index/src/lib.rs b/src/query/storages/common/index/src/lib.rs index c2a7bd08eeb5..f60a1b16982e 100644 --- a/src/query/storages/common/index/src/lib.rs +++ b/src/query/storages/common/index/src/lib.rs @@ -27,9 +27,14 @@ pub use bloom_index::BloomIndexMeta; pub use bloom_index::FilterEvalResult; pub use index::Index; pub use inverted_index::build_tantivy_footer; +pub use inverted_index::check_term_fsts_match; +pub use inverted_index::collect_matched_rows; +pub use inverted_index::extract_component_fields; +pub use inverted_index::extract_fsts; pub use inverted_index::InvertedIndexDirectory; pub use inverted_index::InvertedIndexFile; pub use inverted_index::InvertedIndexMeta; +pub use inverted_index::TermValue; pub use page_index::PageIndex; pub use range_index::statistics_to_domain; pub use range_index::RangeIndex; diff --git a/src/query/storages/fuse/Cargo.toml b/src/query/storages/fuse/Cargo.toml index 9b9b26a4da33..ab86128ebef2 100644 --- a/src/query/storages/fuse/Cargo.toml +++ b/src/query/storages/fuse/Cargo.toml @@ -14,6 +14,7 @@ test = true ahash = "0.8.3" arrow = { workspace = true } arrow-array = { workspace = true } +arrow-ipc = { workspace = true } async-backtrace = { workspace = true } async-channel = "1.7.1" async-trait = { workspace = true } @@ -64,7 +65,8 @@ sha2 = { workspace = true } siphasher = "0.3.10" sys-info = "0.9" tantivy = { workspace = true } -tantivy-jieba = "0.11.0" +tantivy-fst = "0.5" +tantivy-jieba = { git = "https://github.com/b41sh/tantivy-jieba", rev = "af84361" } thrift = "0.17.0" typetag = { workspace = true } uuid = { workspace = true } diff --git a/src/query/storages/fuse/src/io/mod.rs b/src/query/storages/fuse/src/io/mod.rs index 738c816dcf23..06bb6b8b3d4d 100644 --- a/src/query/storages/fuse/src/io/mod.rs +++ b/src/query/storages/fuse/src/io/mod.rs @@ -37,6 +37,7 @@ pub use segments::SegmentsIO; pub use segments::SerializedSegment; pub use snapshots::SnapshotLiteExtended; pub use snapshots::SnapshotsIO; +pub(crate) use write::block_to_inverted_index; pub(crate) use write::create_index_schema; pub(crate) use write::create_inverted_index_builders; pub(crate) use write::create_tokenizer_manager; diff --git a/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs b/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs index 40b5ca1928cf..85dd7ddbb132 100644 --- a/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs +++ b/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs @@ -99,7 +99,8 @@ async fn load_bloom_filter_by_columns<'a>( for column_name in column_needed { for (idx, (name, column_meta)) in 
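// The FuzzyTermQuery paths above walk the FST with a Levenshtein automaton
// (levenshtein_automata + tantivy_fst) and then union the doc ids of the matched terms.
// This stand-in only illustrates the matching semantics with a plain edit-distance scan
// over an in-memory term list; it is not how the automaton-over-FST search works.
fn levenshtein(a: &str, b: &str) -> usize {
    let (a, b): (Vec<char>, Vec<char>) = (a.chars().collect(), b.chars().collect());
    let mut prev: Vec<usize> = (0..=b.len()).collect();
    for (i, ca) in a.iter().enumerate() {
        let mut cur = vec![i + 1];
        for (j, cb) in b.iter().enumerate() {
            let cost = if ca == cb { 0 } else { 1 };
            cur.push((prev[j + 1] + 1).min(cur[j] + 1).min(prev[j] + cost));
        }
        prev = cur;
    }
    prev[b.len()]
}

fn fuzzy_terms<'a>(dict: &'a [&'a str], query: &str, fuzziness: usize) -> Vec<&'a str> {
    dict.iter().copied().filter(|t| levenshtein(t, query) <= fuzziness).collect()
}

fn main() {
    let dict = ["data", "date", "dates", "total"];
    // With fuzziness 1, the query term "data" also matches "date"; the doc ids of every
    // matched term would then be unioned, as in `collect_matched_rows` above.
    assert_eq!(fuzzy_terms(&dict, "data", 1), vec!["data", "date"]);
}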
index_column_chunk_metas.iter().enumerate() { if name == column_name { - col_metas.push((idx as ColumnId, (name, column_meta))) + col_metas.push((idx as ColumnId, (name, column_meta))); + break; } } } diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs index df2ffa89cecc..411dc316b98b 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs @@ -74,7 +74,10 @@ where /// Loads inverted index meta data /// read data from cache, or populate cache items if possible #[fastrace::trace] -async fn load_inverted_index_meta(dal: Operator, path: &str) -> Result> { +pub(crate) async fn load_inverted_index_meta( + dal: Operator, + path: &str, +) -> Result> { let path_owned = path.to_owned(); async move { let reader = MetaReaders::inverted_index_meta_reader(dal); @@ -96,45 +99,41 @@ async fn load_inverted_index_meta(dal: Operator, path: &str) -> Result( - index_path: &'a str, +pub(crate) async fn load_inverted_index_file<'a>( name: &'a str, col_meta: &'a SingleColumnMeta, - need_position: bool, + index_path: &'a str, dal: &'a Operator, ) -> Result> { - // Because the position file is relatively large, reading it will take more time. - // And position data is only used when the query has phrase terms. - // If the query has no phrase terms, we can ignore it and use empty position data instead. - if name == "pos" && !need_position { - let file = InvertedIndexFile::try_create(name.to_owned(), vec![])?; - return Ok(Arc::new(file)); - } - + let start = Instant::now(); let storage_runtime = GlobalIORuntime::instance(); - let file = { - let inverted_index_file_reader = InvertedIndexFileReader::new( + let bytes = { + let column_data_reader = InvertedIndexFileReader::new( index_path.to_owned(), name.to_owned(), col_meta, dal.clone(), ); - async move { inverted_index_file_reader.read().await } + async move { column_data_reader.read().await } } .execute_in_runtime(&storage_runtime) .await??; - Ok(file) + + // Perf. + { + metrics_inc_block_inverted_index_read_milliseconds(start.elapsed().as_millis() as u64); + } + + Ok(bytes) } /// load inverted index directory #[fastrace::trace] pub(crate) async fn load_inverted_index_directory<'a>( dal: Operator, - need_position: bool, field_nums: usize, index_path: &'a str, ) -> Result { - let start = Instant::now(); // load inverted index meta, contains the offsets of each files. let inverted_index_meta = load_inverted_index_meta(dal.clone(), index_path).await?; @@ -150,25 +149,18 @@ pub(crate) async fn load_inverted_index_directory<'a>( let futs = inverted_index_meta .columns .iter() - .map(|(name, column_meta)| { - load_inverted_index_file(index_path, name, column_meta, need_position, &dal) - }) + .map(|(name, column_meta)| load_inverted_index_file(name, column_meta, index_path, &dal)) .collect::>(); let files: Vec<_> = try_join_all(futs).await?.into_iter().collect(); // use those files to create inverted index directory let directory = InvertedIndexDirectory::try_create(field_nums, files)?; - // Perf. - { - metrics_inc_block_inverted_index_read_milliseconds(start.elapsed().as_millis() as u64); - } - Ok(directory) } /// Read the inverted index file data. 
-pub struct InvertedIndexFileReader { +pub(crate) struct InvertedIndexFileReader { cached_reader: CachedReader, param: LoadParams, } @@ -235,13 +227,13 @@ pub struct InvertedIndexFileLoader { impl Loader for InvertedIndexFileLoader { #[async_backtrace::framed] async fn load(&self, params: &LoadParams) -> Result { - let bytes = self + let buffer = self .operator .read_with(¶ms.location) .range(self.offset..self.offset + self.len) .await?; - InvertedIndexFile::try_create(self.name.clone(), bytes.to_vec()) + InvertedIndexFile::try_create(self.name.clone(), buffer.to_vec()) } fn cache_key(&self, _params: &LoadParams) -> CacheKey { diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs index 20b8950f27de..e3baee6c0c0f 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs @@ -12,60 +12,146 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; +use std::collections::HashSet; +use std::sync::Arc; use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::types::F32; use databend_common_metrics::storage::metrics_inc_block_inverted_index_search_milliseconds; -use databend_storages_common_index::InvertedIndexDirectory; +use databend_storages_common_index::check_term_fsts_match; +use databend_storages_common_index::collect_matched_rows; +use databend_storages_common_index::TermValue; +use databend_storages_common_table_meta::meta::SingleColumnMeta; +use futures_util::future::try_join_all; use opendal::Operator; use tantivy::collector::DocSetCollector; use tantivy::collector::TopDocs; +use tantivy::directory::FileSlice; +use tantivy::directory::OwnedBytes; +use tantivy::positions::PositionReader; +use tantivy::postings::BlockSegmentPostings; use tantivy::query::Query; +use tantivy::query::QueryClone; +use tantivy::schema::IndexRecordOption; +use tantivy::termdict::TermInfoStore; use tantivy::tokenizer::TokenizerManager; use tantivy::Index; +use tantivy_fst::raw::Fst; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_directory; +use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_file; +use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_meta; use crate::io::read::inverted_index::inverted_index_loader::InvertedIndexFileReader; #[derive(Clone)] pub struct InvertedIndexReader { - directory: InvertedIndexDirectory, + dal: Operator, } impl InvertedIndexReader { - pub async fn try_create( - dal: Operator, - field_nums: usize, - need_position: bool, - index_loc: &str, - ) -> Result { - let directory = - load_inverted_index_directory(dal.clone(), need_position, field_nums, index_loc) - .await?; - - Ok(Self { directory }) + pub fn create(dal: Operator) -> Self { + Self { dal } } // Filter the rows and scores in the block that can match the query text, // if there is no row that can match, this block can be pruned. 
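// The change above replaces an eagerly loaded InvertedIndexDirectory with a reader that
// only keeps the storage handle (`dal: Operator`) and fetches byte ranges on demand:
// first the index meta, then just the fst/term/idx/pos slices the current query needs.
// A rough model of that shape, with a made-up RangeStorage trait standing in for opendal.
trait RangeStorage {
    fn read_range(&self, path: &str, offset: u64, len: u64) -> Vec<u8>;
}

struct LazyIndexReader<S: RangeStorage> {
    storage: S,
}

impl<S: RangeStorage> LazyIndexReader<S> {
    fn new(storage: S) -> Self {
        Self { storage }
    }

    // Nothing is downloaded at construction time; each column slice is read only when
    // (and if) the search actually needs it.
    fn read_column_slice(&self, path: &str, offset: u64, len: u64) -> Vec<u8> {
        self.storage.read_range(path, offset, len)
    }
}

struct InMemory(Vec<u8>);

impl RangeStorage for InMemory {
    fn read_range(&self, _path: &str, offset: u64, len: u64) -> Vec<u8> {
        self.0[offset as usize..(offset + len) as usize].to_vec()
    }
}

fn main() {
    let reader = LazyIndexReader::new(InMemory((0u8..16).collect()));
    assert_eq!(reader.read_column_slice("idx-0", 4, 3), vec![4, 5, 6]);
}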
#[allow(clippy::type_complexity)] - pub fn do_filter( + #[allow(clippy::too_many_arguments)] + pub async fn do_filter( self, + field_nums: usize, + need_position: bool, has_score: bool, - query: &dyn Query, + query: Box, + field_ids: &HashSet, + index_record: &IndexRecordOption, + fuzziness: &Option, tokenizer_manager: TokenizerManager, - row_count: u64, + row_count: u32, + index_loc: &str, ) -> Result)>>> { let start = Instant::now(); - let mut index = Index::open(self.directory)?; + + let matched_rows = self + .search( + index_loc, + query, + field_ids, + field_nums, + need_position, + has_score, + index_record, + fuzziness, + tokenizer_manager, + row_count, + ) + .await?; + + // Perf. + { + metrics_inc_block_inverted_index_search_milliseconds(start.elapsed().as_millis() as u64); + } + + Ok(matched_rows) + } + + async fn read_column_data<'a>( + &self, + index_path: &'a str, + name: &str, + field_ids: &HashSet, + inverted_index_meta_map: &HashMap, + ) -> Result> { + let mut col_metas = vec![]; + let mut col_field_map = HashMap::new(); + for field_id in field_ids { + let col_name = format!("{}-{}", name, field_id); + let col_meta = inverted_index_meta_map.get(&col_name).unwrap(); + + col_metas.push((col_name.clone(), col_meta)); + col_field_map.insert(col_name, *field_id); + } + + let futs = col_metas + .iter() + .map(|(name, col_meta)| load_inverted_index_file(name, col_meta, index_path, &self.dal)) + .collect::>(); + + let col_files = try_join_all(futs) + .await? + .into_iter() + .map(|f| { + let field_id = col_field_map.get(&f.name).unwrap(); + (*field_id, f.data.clone()) + }) + .collect::>(); + + Ok(col_files) + } + + // first version search function, using tantivy searcher. + async fn search_v0<'a>( + &self, + index_path: &'a str, + query: Box, + field_nums: usize, + has_score: bool, + tokenizer_manager: TokenizerManager, + row_count: u32, + ) -> Result)>>> { + let directory = + load_inverted_index_directory(self.dal.clone(), field_nums, index_path).await?; + + let mut index = Index::open(directory)?; index.set_tokenizers(tokenizer_manager); let reader = index.reader()?; let searcher = reader.searcher(); let matched_rows = if has_score { let collector = TopDocs::with_limit(row_count as usize); - let docs = searcher.search(query, &collector)?; + let docs = searcher.search(&query, &collector)?; let mut matched_rows = Vec::with_capacity(docs.len()); for (score, doc_addr) in docs { @@ -76,7 +162,7 @@ impl InvertedIndexReader { matched_rows } else { let collector = DocSetCollector; - let docs = searcher.search(query, &collector)?; + let docs = searcher.search(&query, &collector)?; let mut matched_rows = Vec::with_capacity(docs.len()); for doc_addr in docs { @@ -85,13 +171,225 @@ impl InvertedIndexReader { } matched_rows }; + if !matched_rows.is_empty() { + Ok(Some(matched_rows)) + } else { + Ok(None) + } + } - // Perf. - { - metrics_inc_block_inverted_index_search_milliseconds(start.elapsed().as_millis() as u64); + // Follow the process below to perform the query search: + // 1. Read the `fst` first, check if the term in the query matches, + // return if it doesn't matched. + // 2. Read the `term dict` to get the `postings_range` in `idx` + // and the `positions_range` in `pos` for each terms. + // 3. Read the `doc_ids` and `term_freqs` in `idx` for each terms + // using `postings_range`. + // 4. If it's a phrase query, read the `position` of each terms in + // `pos` using `positions_range`. + // 5. Collect matched doc ids using term-related information. 
+ // + // If the term does not match, only the `fst` file needs to be read. + // If the term matches, only the `idx` and `pos` data of the related terms + // need to be read instead of all the `idx` and `pos` data. + #[allow(clippy::too_many_arguments)] + async fn search<'a>( + &self, + index_path: &'a str, + query: Box, + field_ids: &HashSet, + field_nums: usize, + need_position: bool, + has_score: bool, + index_record: &IndexRecordOption, + fuzziness: &Option, + tokenizer_manager: TokenizerManager, + row_count: u32, + ) -> Result)>>> { + // 1. read index meta + let inverted_index_meta = load_inverted_index_meta(self.dal.clone(), index_path).await?; + + let inverted_index_meta_map = inverted_index_meta + .columns + .clone() + .into_iter() + .collect::>(); + + // if meta contains `meta.json` columns, + // the index file is the first version implementation + // use compatible search function to read. + if inverted_index_meta_map.contains_key("meta.json") { + return self + .search_v0( + index_path, + query, + field_nums, + has_score, + tokenizer_manager, + row_count, + ) + .await; } - if !matched_rows.is_empty() { + // 2. read fst files + let fst_files = self + .read_column_data(index_path, "fst", field_ids, &inverted_index_meta_map) + .await?; + + let mut fst_maps = HashMap::new(); + for (field_id, fst_data) in fst_files.into_iter() { + let fst = Fst::new(fst_data).map_err(|err| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("Fst data is corrupted: {:?}", err), + ) + })?; + let fst_map = tantivy_fst::Map::from(fst); + fst_maps.insert(field_id, fst_map); + } + + // 3. check whether query is matched in the fsts. + let mut matched_terms = HashMap::new(); + let mut fuzziness_terms = HashMap::new(); + let matched = check_term_fsts_match( + query.box_clone(), + &fst_maps, + fuzziness, + &mut matched_terms, + &mut fuzziness_terms, + ); + + if !matched { + return Ok(None); + } + + // 4. read term dict files, and get term info for each terms. + let term_dict_files = self + .read_column_data(index_path, "term", field_ids, &inverted_index_meta_map) + .await?; + + let mut term_dict_maps = HashMap::new(); + for (field_id, term_dict_data) in term_dict_files.into_iter() { + let term_dict_file = FileSlice::new(Arc::new(term_dict_data)); + let term_info_store = TermInfoStore::open(term_dict_file)?; + term_dict_maps.insert(field_id, term_info_store); + } + + let mut term_values = HashMap::new(); + for (term, term_ord) in matched_terms.iter() { + let field = term.field(); + let field_id = field.field_id() as usize; + + let term_dict = term_dict_maps.get(&field_id).unwrap(); + let term_info = term_dict.get(*term_ord); + + let term_value = TermValue { + term_info, + doc_ids: vec![], + term_freqs: vec![], + position_reader: None, + }; + term_values.insert(term.clone(), term_value); + } + + // 5. read postings and optional positions. + // collect doc ids, term frequencies and optional position readers. 
+ let mut slice_metas = Vec::with_capacity(term_values.len()); + let mut name_map = HashMap::new(); + for (term, term_value) in term_values.iter() { + let field = term.field(); + let field_id = field.field_id() as usize; + + let idx_name = format!("idx-{}", field_id); + let idx_meta = inverted_index_meta_map.get(&idx_name).unwrap(); + + // ignore 8 bytes total_num_tokens_slice + let offset = idx_meta.offset + 8 + (term_value.term_info.postings_range.start as u64); + let len = term_value.term_info.postings_range.len() as u64; + let idx_slice_meta = SingleColumnMeta { + offset, + len, + num_values: 1, + }; + + let idx_slice_name = + format!("{}-{}", idx_name, term_value.term_info.postings_range.start); + slice_metas.push((idx_slice_name.clone(), idx_slice_meta)); + name_map.insert(idx_slice_name, term.clone()); + + if need_position { + let pos_name = format!("pos-{}", field_id); + let pos_meta = inverted_index_meta_map.get(&pos_name).unwrap(); + let offset = pos_meta.offset + (term_value.term_info.positions_range.start as u64); + let len = term_value.term_info.positions_range.len() as u64; + let pos_slice_meta = SingleColumnMeta { + offset, + len, + num_values: 1, + }; + let pos_slice_name = format!( + "{}-{}", + pos_name, term_value.term_info.positions_range.start + ); + slice_metas.push((pos_slice_name.clone(), pos_slice_meta)); + name_map.insert(pos_slice_name, term.clone()); + } + } + + let futs = slice_metas + .iter() + .map(|(name, col_meta)| load_inverted_index_file(name, col_meta, index_path, &self.dal)) + .collect::>(); + + let slice_files = try_join_all(futs) + .await? + .into_iter() + .map(|f| (f.name.clone(), f.data.clone())) + .collect::>(); + + for (slice_name, slice_data) in slice_files.into_iter() { + let term = name_map.get(&slice_name).unwrap(); + let term_value = term_values.get_mut(term).unwrap(); + + if slice_name.starts_with("idx") { + let posting_file = FileSlice::new(Arc::new(slice_data)); + let postings = BlockSegmentPostings::open( + term_value.term_info.doc_freq, + posting_file, + *index_record, + *index_record, + )?; + let doc_ids = postings.docs(); + let term_freqs = postings.freqs(); + + term_value.doc_ids = doc_ids.to_vec(); + term_value.term_freqs = term_freqs.to_vec(); + } else if slice_name.starts_with("pos") { + let position_reader = PositionReader::open(slice_data)?; + term_value.position_reader = Some(position_reader); + } + } + + // 6. collect matched rows by term values. + let matched_docs = collect_matched_rows( + query.box_clone(), + row_count, + &fuzziness_terms, + &mut term_values, + ); + + if !matched_docs.is_empty() { + let mut matched_rows = Vec::with_capacity(matched_docs.len()); + if has_score { + // TODO: add score + for doc_id in matched_docs { + matched_rows.push((doc_id as usize, Some(F32::from(1.0)))); + } + } else { + for doc_id in matched_docs { + matched_rows.push((doc_id as usize, None)) + } + } Ok(Some(matched_rows)) } else { Ok(None) diff --git a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs index 9945fffe1795..ddb2dabbcf56 100644 --- a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs +++ b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
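// Sketch of the byte-range arithmetic used above when slicing one term's postings out of
// the per-field "idx-{field_id}" column: the column starts with an 8-byte
// total_num_tokens header, and each term's TermInfo carries a postings_range relative to
// the end of that header. The numbers in main are fabricated for the example.
use std::ops::Range;

fn postings_slice(column_offset: u64, postings_range: Range<usize>) -> (u64, u64) {
    let start = column_offset + 8 + postings_range.start as u64;
    let len = postings_range.len() as u64;
    (start, len)
}

fn main() {
    // Column "idx-0" begins at byte 1_024 of the index file; the term's postings occupy
    // bytes 40..112 within the column body, after the 8-byte header.
    assert_eq!(postings_slice(1_024, 40..112), (1_024 + 8 + 40, 72));
}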
diff --git a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs
index 9945fffe1795..ddb2dabbcf56 100644
--- a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs
+++ b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-use std::io::Read;
 use std::io::SeekFrom;
+use arrow_ipc::convert::try_schema_from_ipc_buffer;
 use bytes::Buf;
 use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
 use databend_common_expression::TableSchemaRef;
+use databend_common_io::constants::DEFAULT_FOOTER_READ_SIZE;
 use databend_storages_common_cache::CacheManager;
 use databend_storages_common_cache::InMemoryItemCacheReader;
 use databend_storages_common_cache::LoadParams;
@@ -159,64 +160,94 @@ impl Loader<InvertedIndexMeta> for LoaderWrapper<Operator> {
     #[async_backtrace::framed]
     async fn load(&self, params: &LoadParams) -> Result<InvertedIndexMeta> {
         let operator = &self.0;
-        let meta = operator.stat(&params.location).await.map_err(|err| {
-            ErrorCode::StorageOther(format!(
-                "read inverted index file meta failed, {}, {:?}",
-                params.location, err
-            ))
-        })?;
-        let file_size = meta.content_length();
-
-        if file_size < 36 {
-            return Err(ErrorCode::StorageOther(
-                "inverted index file must contain a footer with at least 36 bytes",
-            ));
-        }
-        let default_end_len = 36;
+        let file_size = if let Some(len) = params.len_hint {
+            len
+        } else {
+            let meta = operator.stat(&params.location).await.map_err(|err| {
+                ErrorCode::StorageOther(format!(
+                    "read inverted index file meta failed, {}, {:?}",
+                    params.location, err
+                ))
+            })?;
+            meta.content_length()
+        };
+
+        // read and cache up to DEFAULT_FOOTER_READ_SIZE bytes from the end and process the footer
+        let end_len = std::cmp::min(DEFAULT_FOOTER_READ_SIZE, file_size) as usize;
         // read the end of the file
-        let data = operator
+        let buffer = operator
             .read_with(&params.location)
-            .range(file_size - default_end_len as u64..file_size)
+            .range(file_size - end_len as u64..file_size)
             .await
             .map_err(|err| {
                 ErrorCode::StorageOther(format!(
-                    "read file meta failed, {}, {:?}",
+                    "read inverted index file meta failed, {}, {:?}",
                     params.location, err
                 ))
-            })?;
+            })?
+            .to_vec();
-        let mut buf = vec![0u8; 4];
-        let mut reader = data.reader();
-        let mut offsets = Vec::with_capacity(8);
-        let fast_fields_offset = read_u32(&mut reader, buf.as_mut_slice())? as u64;
-        let store_offset = read_u32(&mut reader, buf.as_mut_slice())? as u64;
-        let field_norms_offset = read_u32(&mut reader, buf.as_mut_slice())? as u64;
-        let positions_offset = read_u32(&mut reader, buf.as_mut_slice())? as u64;
-        let postings_offset = read_u32(&mut reader, buf.as_mut_slice())? as u64;
-        let terms_offset = read_u32(&mut reader, buf.as_mut_slice())? as u64;
-        let meta_offset = read_u32(&mut reader, buf.as_mut_slice())? as u64;
-        let managed_offset = read_u32(&mut reader, buf.as_mut_slice())? as u64;
-
-        offsets.push(("fast".to_string(), fast_fields_offset));
-        offsets.push(("store".to_string(), store_offset));
-        offsets.push(("fieldnorm".to_string(), field_norms_offset));
-        offsets.push(("pos".to_string(), positions_offset));
-        offsets.push(("idx".to_string(), postings_offset));
-        offsets.push(("term".to_string(), terms_offset));
-        offsets.push(("meta.json".to_string(), meta_offset));
-        offsets.push((".managed.json".to_string(), managed_offset));
+        let meta_len =
+            u32::from_le_bytes(buffer[end_len - 4..end_len].try_into().unwrap()) as usize;
+
+        // read legacy index file format
+        if meta_len == 8 {
+            let column_names = vec![
+                "fast".to_string(),
+                "store".to_string(),
+                "fieldnorm".to_string(),
+                "pos".to_string(),
+                "idx".to_string(),
+                "term".to_string(),
+                "meta.json".to_string(),
+                ".managed.json".to_string(),
+            ];
+
+            let mut prev_offset = 0;
+            let mut column_range = end_len - 36;
+            let mut columns = Vec::with_capacity(column_names.len());
+            for name in column_names {
+                let offset =
+                    u32::from_le_bytes(buffer[column_range..column_range + 4].try_into().unwrap())
+                        as u64;
+                column_range += 4;
+
+                let column_meta = SingleColumnMeta {
+                    offset: prev_offset,
+                    len: offset - prev_offset,
+                    num_values: 1,
+                };
+                prev_offset = offset;
+                columns.push((name, column_meta));
+            }
+            return Ok(InvertedIndexMeta { columns });
+        }
+
+        let schema_len =
+            u32::from_le_bytes(buffer[end_len - 8..end_len - 4].try_into().unwrap()) as usize;
+
+        let schema_range_start = end_len - meta_len;
+        let schema_range_end = schema_range_start + schema_len;
+        let index_schema =
+            try_schema_from_ipc_buffer(&buffer[schema_range_start..schema_range_end])?;
         let mut prev_offset = 0;
-        let mut columns = Vec::with_capacity(offsets.len());
-        for (name, offset) in offsets.into_iter() {
+        let mut column_range = schema_range_end;
+        let mut columns = Vec::with_capacity(index_schema.fields.len());
+        for field in &index_schema.fields {
+            let offset =
+                u32::from_le_bytes(buffer[column_range..column_range + 4].try_into().unwrap())
+                    as u64;
+            column_range += 4;
+
             let column_meta = SingleColumnMeta {
                 offset: prev_offset,
                 len: offset - prev_offset,
                 num_values: 1,
             };
             prev_offset = offset;
-            columns.push((name, column_meta));
+            columns.push((field.name().clone(), column_meta));
         }
         Ok(InvertedIndexMeta { columns })
@@ -338,9 +369,3 @@ mod thrift_file_meta_read {
         }
     }
 }
-
-#[inline(always)]
-fn read_u32<R: Read>(r: &mut R, buf: &mut [u8]) -> Result<u32> {
-    r.read_exact(buf)?;
-    Ok(u32::from_le_bytes(buf.try_into().unwrap()))
-}
diff --git a/src/query/storages/fuse/src/io/write/block_writer.rs b/src/query/storages/fuse/src/io/write/block_writer.rs
index 78fbfe5ed613..91361e2754ec 100644
--- a/src/query/storages/fuse/src/io/write/block_writer.rs
+++ b/src/query/storages/fuse/src/io/write/block_writer.rs
@@ -51,6 +51,7 @@ use databend_storages_common_table_meta::table::TableCompression;
 use log::info;
 use opendal::Operator;
+use crate::io::block_to_inverted_index;
 use crate::io::write::WriteSettings;
 use crate::io::BlockReader;
 use crate::io::InvertedIndexWriter;
@@ -295,7 +296,10 @@ impl InvertedIndexState {
             &inverted_index_builder.options,
         )?;
         writer.add_block(source_schema, block)?;
-        let data = writer.finalize()?;
+        let (index_schema, index_block) = writer.finalize()?;
+
+        let mut data = Vec::with_capacity(DEFAULT_BLOCK_INDEX_BUFFER_SIZE);
+        block_to_inverted_index(&index_schema, index_block, &mut data)?;
         let size = data.len() as u64;
         // Perf.
diff --git a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs
index 4bfb40180687..8935a3420e78 100644
--- a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs
+++ b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs
@@ -16,15 +16,26 @@
 use std::collections::BTreeMap;
 use std::collections::HashSet;
 use std::io::Write;
 use std::path::Path;
+use std::sync::Arc;
+use arrow_ipc::writer::write_message;
+use arrow_ipc::writer::IpcDataGenerator;
+use arrow_ipc::writer::IpcWriteOptions;
 use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
+use databend_common_expression::converts::arrow::table_schema_to_arrow_schema;
+use databend_common_expression::types::BinaryType;
 use databend_common_expression::types::DataType;
+use databend_common_expression::BlockEntry;
 use databend_common_expression::DataBlock;
 use databend_common_expression::DataSchemaRef;
 use databend_common_expression::ScalarRef;
+use databend_common_expression::TableSchema;
 use databend_common_expression::TableSchemaRef;
+use databend_common_expression::Value;
 use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE;
+use databend_storages_common_index::extract_component_fields;
+use databend_storages_common_index::extract_fsts;
 use tantivy::indexer::UserOperation;
 use tantivy::schema::Field;
 use tantivy::schema::IndexRecordOption;
@@ -130,15 +141,39 @@ impl InvertedIndexWriter {
     }
     #[async_backtrace::framed]
-    pub fn finalize(mut self) -> Result<Vec<u8>> {
+    pub fn finalize(mut self) -> Result<(TableSchema, DataBlock)> {
         let _ = self.index_writer.run(self.operations);
         let _ = self.index_writer.commit()?;
         let index = self.index_writer.index();
-        let mut buffer = Vec::with_capacity(DEFAULT_BLOCK_BUFFER_SIZE);
-        Self::write_index(&mut buffer, index)?;
+        let mut fields = Vec::new();
+        let mut values = Vec::new();
-        Ok(buffer)
+        let segments = index.searchable_segments()?;
+        let segment = &segments[0];
+
+        let termdict_file = segment.open_read(SegmentComponent::Terms)?;
+        extract_fsts(termdict_file, &mut fields, &mut values)?;
+
+        let posting_file = segment.open_read(SegmentComponent::Postings)?;
+        extract_component_fields("idx", posting_file, &mut fields, &mut values)?;
+
+        let position_file = segment.open_read(SegmentComponent::Positions)?;
+        extract_component_fields("pos", position_file, &mut fields, &mut values)?;
+
+        let field_norms_file = segment.open_read(SegmentComponent::FieldNorms)?;
+        extract_component_fields("fieldnorm", field_norms_file, &mut fields, &mut values)?;
+
+        let inverted_index_schema = TableSchema::new(fields);
+
+        let mut index_columns = Vec::with_capacity(values.len());
+        for value in values.into_iter() {
+            let index_value = Value::Scalar(value);
+            index_columns.push(BlockEntry::new(DataType::Binary, index_value));
+        }
+        let inverted_index_block = DataBlock::new(index_columns, 1);
+
+        Ok((inverted_index_schema, inverted_index_block))
     }
     // The tantivy index data consists of eight files.
@@ -314,6 +349,57 @@ impl InvertedIndexWriter {
     }
 }
+// The inverted index block includes five types of data,
+// each of which may have multiple fields.
+// 1. `fst` is used to check whether a term exists.
+// for example: fst-0, fst-1, ..
+// 2. `term dict` records the idx and pos locations of each term.
+// for example: term-0, term-1, ..
+// 3. `idx` records the doc ids of each term.
+// for example: idx-0, idx-1, ..
+// 4. `pos` records the positions of each term in the docs.
+// for example: pos-0, pos-1, ..
+// 5. `fieldnorms` records the number of tokens in each doc.
+// for example: fieldnorms-0, fieldnorms-1, ..
+//
+// The column values are written first, followed by the arrow schema,
+// the column offsets, and finally the schema length and the total
+// footer length (`meta_len`).
+pub(crate) fn block_to_inverted_index(
+    table_schema: &TableSchema,
+    block: DataBlock,
+    write_buffer: &mut Vec<u8>,
+) -> Result<()> {
+    let mut offsets = Vec::with_capacity(block.num_columns());
+    for column in block.columns() {
+        let value: Value<BinaryType> = column.value.try_downcast().unwrap();
+        write_buffer.extend_from_slice(value.as_scalar().unwrap());
+        let offset = write_buffer.len() as u32;
+        offsets.push(offset);
+    }
+
+    // footer: schema + offsets + schema_len + meta_len
+    let arrow_schema = Arc::new(table_schema_to_arrow_schema(table_schema));
+    let generator = IpcDataGenerator {};
+    let write_options = IpcWriteOptions::default();
+    let encoded = generator.schema_to_bytes(&arrow_schema, &write_options);
+    let mut schema_buf = Vec::new();
+    let (schema_len, _) = write_message(&mut schema_buf, encoded, &write_options)?;
+    write_buffer.extend_from_slice(&schema_buf);
+
+    let schema_len = schema_len as u32;
+    let offset_len = (offsets.len() * 4) as u32;
+    for offset in offsets {
+        write_buffer.extend_from_slice(&offset.to_le_bytes());
+    }
+    let meta_len = schema_len + offset_len + 8;
+
+    write_buffer.extend_from_slice(&schema_len.to_le_bytes());
+    write_buffer.extend_from_slice(&meta_len.to_le_bytes());
+
+    Ok(())
+}
+
 // Create tokenizer can handle both Chinese and English
 pub(crate) fn create_tokenizer_manager(
     index_options: &BTreeMap<String, String>,
diff --git a/src/query/storages/fuse/src/io/write/mod.rs b/src/query/storages/fuse/src/io/write/mod.rs
index 4c40bbb69c0f..8549cb15d17b 100644
--- a/src/query/storages/fuse/src/io/write/mod.rs
+++ b/src/query/storages/fuse/src/io/write/mod.rs
@@ -27,6 +27,7 @@ pub use block_writer::BlockWriter;
 pub use block_writer::BloomIndexBuilder;
 pub use block_writer::BloomIndexState;
 pub use block_writer::InvertedIndexBuilder;
+pub(crate) use inverted_index_writer::block_to_inverted_index;
 pub(crate) use inverted_index_writer::create_index_schema;
 pub(crate) use inverted_index_writer::create_tokenizer_manager;
 pub use inverted_index_writer::InvertedIndexWriter;
diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs
index b6ce19faf4be..453b77d734ef 100644
--- a/src/query/storages/fuse/src/operations/append.rs
+++ b/src/query/storages/fuse/src/operations/append.rs
@@ -22,6 +22,7 @@ use databend_common_expression::BlockThresholds;
 use databend_common_expression::DataField;
 use databend_common_expression::DataSchema;
 use databend_common_expression::Expr;
+use databend_common_expression::LimitType;
 use databend_common_expression::SortColumnDescription;
 use databend_common_functions::BUILTIN_FUNCTIONS;
 use databend_common_pipeline_core::Pipeline;
@@ -109,7 +110,12 @@ impl FuseTable {
         let sort_desc = Arc::new(sort_desc);
         let mut builder = pipeline.try_create_transform_pipeline_builder_with_len(
-            || Ok(TransformSortPartial::new(None, sort_desc.clone())),
+            || {
+                Ok(TransformSortPartial::new(
+                    LimitType::None,
+                    sort_desc.clone(),
+                ))
+            },
             transform_len,
         )?;
         if need_match {
@@ -152,7 +158,8 @@ impl FuseTable {
             })
             .collect();
             let sort_desc = Arc::new(sort_desc);
-            pipeline.add_transformer(|| TransformSortPartial::new(None, sort_desc.clone()));
+            pipeline
+                .add_transformer(|| TransformSortPartial::new(LimitType::None, sort_desc.clone()));
         }
         Ok(cluster_stats_gen)
     }
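For reference, `block_to_inverted_index` above ends the file with a small self-describing footer: the arrow IPC schema, one little-endian `u32` end-offset per column, then a `u32` schema length and a `u32` `meta_len` that covers the schema, the offsets and the two trailing length fields. A minimal decoding sketch under those assumptions; `decode_footer` is a hypothetical helper, not code from this patch (the real reader in `meta_readers.rs` above also parses the schema to recover the column names):

```rust
// Decode the footer produced by `block_to_inverted_index` from the tail bytes of an
// index file. Assumes `tail` holds the whole footer (at least `meta_len` bytes), which
// the reader guarantees by fetching up to DEFAULT_FOOTER_READ_SIZE bytes from the end.
fn decode_footer(tail: &[u8]) -> Option<(std::ops::Range<usize>, Vec<u32>)> {
    let end = tail.len();
    // last 4 bytes: meta_len = schema_len + 4 * num_columns + 8
    let meta_len = u32::from_le_bytes(tail[end - 4..end].try_into().ok()?) as usize;
    if meta_len == 8 {
        // legacy (v0) files end with eight fixed u32 offsets instead of a schema
        return None;
    }
    // the 4 bytes before that: length of the arrow IPC schema message
    let schema_len = u32::from_le_bytes(tail[end - 8..end - 4].try_into().ok()?) as usize;
    let schema_start = end - meta_len;
    let schema_range = schema_start..schema_start + schema_len;
    // one u32 end-offset per column sits between the schema and the two length fields
    let mut offsets = Vec::new();
    let mut pos = schema_range.end;
    while pos + 4 <= end - 8 {
        offsets.push(u32::from_le_bytes(tail[pos..pos + 4].try_into().ok()?));
        pos += 4;
    }
    Some((schema_range, offsets))
}
```

Each offset is the end position of a column's bytes, so consecutive offsets yield the `(offset, len)` pairs that the reader stores in `SingleColumnMeta`.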
diff --git a/src/query/storages/fuse/src/operations/inverted_index.rs b/src/query/storages/fuse/src/operations/inverted_index.rs
index b46e57d1d267..d71c74beefe4 100644
--- a/src/query/storages/fuse/src/operations/inverted_index.rs
+++ b/src/query/storages/fuse/src/operations/inverted_index.rs
@@ -29,6 +29,7 @@ use databend_common_expression::DataBlock;
 use databend_common_expression::DataSchema;
 use databend_common_expression::DataSchemaRef;
 use databend_common_expression::TableSchemaRef;
+use databend_common_io::constants::DEFAULT_BLOCK_INDEX_BUFFER_SIZE;
 use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_bytes;
 use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_milliseconds;
 use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_nums;
@@ -46,6 +47,7 @@ use databend_storages_common_table_meta::meta::BlockMeta;
 use databend_storages_common_table_meta::meta::Location;
 use opendal::Operator;
+use crate::io::block_to_inverted_index;
 use crate::io::write_data;
 use crate::io::BlockReader;
 use crate::io::InvertedIndexWriter;
@@ -292,7 +294,10 @@ impl AsyncTransform for InvertedIndexTransform {
         let mut writer =
             InvertedIndexWriter::try_create(self.data_schema.clone(), &self.index_options)?;
         writer.add_block(&self.source_schema, &data_block)?;
-        let data = writer.finalize()?;
+
+        let (index_schema, index_block) = writer.finalize()?;
+        let mut data = Vec::with_capacity(DEFAULT_BLOCK_INDEX_BUFFER_SIZE);
+        let _ = block_to_inverted_index(&index_schema, index_block, &mut data)?;
         let index_size = data.len() as u64;
         write_data(data, &self.operator, &index_location).await?;
diff --git a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs
index 5240f851becc..9f58e4233edf 100644
--- a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs
+++ b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+use std::collections::HashSet;
 use std::sync::Arc;
 use databend_common_catalog::plan::InvertedIndexInfo;
@@ -20,8 +21,10 @@ use databend_common_exception::Result;
 use databend_common_expression::types::F32;
 use opendal::Operator;
 use tantivy::query::Query;
+use tantivy::query::QueryClone;
 use tantivy::query::QueryParser;
 use tantivy::schema::Field;
+use tantivy::schema::IndexRecordOption;
 use tantivy::tokenizer::TokenizerManager;
 use crate::io::create_index_schema;
@@ -56,6 +59,9 @@ pub struct InvertedIndexPruner {
     need_position: bool,
     tokenizer_manager: TokenizerManager,
     query: Box<dyn Query>,
+    field_ids: HashSet<usize>,
+    index_record: IndexRecordOption,
+    fuzziness: Option<u8>,
 }
 impl InvertedIndexPruner {
@@ -65,14 +71,26 @@
     ) -> Result<Option<Arc<InvertedIndexPruner>>> {
         let inverted_index_info = push_down.as_ref().and_then(|p| p.inverted_index.as_ref());
         if let Some(inverted_index_info) = inverted_index_info {
-            let (query, tokenizer_manager) = create_inverted_index_query(inverted_index_info)?;
+            let (query, fuzziness, tokenizer_manager) =
+                create_inverted_index_query(inverted_index_info)?;
+
+            let index_record: IndexRecordOption =
+                match inverted_index_info.index_options.get("index_record") {
+                    Some(v) => serde_json::from_str(v)?,
+                    None => IndexRecordOption::WithFreqsAndPositions,
+                };
             let mut need_position = false;
-            query.query_terms(&mut |_, pos| {
+            let mut field_ids = HashSet::new();
+            query.query_terms(&mut |term, pos| {
+                let field = term.field();
+                let field_id = field.field_id() as usize;
+                field_ids.insert(field_id);
                 if pos {
                     need_position = true;
                 }
             });
+            // whether we need to generate the internal score column
             let has_score = inverted_index_info.has_score;
             let field_nums = inverted_index_info.index_schema.num_fields();
@@ -88,6 +106,9 @@ impl InvertedIndexPruner {
                 need_position,
                 tokenizer_manager,
                 query,
+                field_ids,
+                index_record,
+                fuzziness,
             })));
         }
         Ok(None)
@@ -105,20 +126,22 @@
             &self.index_version,
         );
-        let inverted_index_reader = InvertedIndexReader::try_create(
-            self.dal.clone(),
-            self.field_nums,
-            self.need_position,
-            &index_loc,
-        )
-        .await?;
-
-        let matched_rows = inverted_index_reader.do_filter(
-            self.has_score,
-            &self.query,
-            self.tokenizer_manager.clone(),
-            row_count,
-        )?;
+        let inverted_index_reader = InvertedIndexReader::create(self.dal.clone());
+
+        let matched_rows = inverted_index_reader
+            .do_filter(
+                self.field_nums,
+                self.need_position,
+                self.has_score,
+                self.query.box_clone(),
+                &self.field_ids,
+                &self.index_record,
+                &self.fuzziness,
+                self.tokenizer_manager.clone(),
+                row_count as u32,
+                &index_loc,
+            )
+            .await?;
         Ok(matched_rows)
     }
@@ -127,7 +150,7 @@
 // create tantivy query for inverted index.
 pub fn create_inverted_index_query(
     inverted_index_info: &InvertedIndexInfo,
-) -> Result<(Box<dyn Query>, TokenizerManager)> {
+) -> Result<(Box<dyn Query>, Option<u8>, TokenizerManager)> {
     // collect query fields and optional boosts
     let mut query_fields = Vec::with_capacity(inverted_index_info.query_fields.len());
     let mut query_field_boosts = Vec::with_capacity(inverted_index_info.query_fields.len());
@@ -159,11 +182,11 @@ pub fn create_inverted_index_query(
     let fuzziness = inverted_index_info
         .inverted_index_option
         .as_ref()
-        .and_then(|o| o.fuzziness.as_ref());
+        .and_then(|o| o.fuzziness);
     if let Some(fuzziness) = fuzziness {
         // Fuzzy query matches rows containing a specific term that is within Levenshtein distance.
for field in query_fields { - query_parser.set_field_fuzzy(field, false, *fuzziness, true); + query_parser.set_field_fuzzy(field, false, fuzziness, true); } } let operator = inverted_index_info @@ -189,5 +212,5 @@ pub fn create_inverted_index_query( query_parser.parse_query(&inverted_index_info.query_text)? }; - Ok((query, tokenizer_manager)) + Ok((query, fuzziness, tokenizer_manager)) } diff --git a/tests/sqllogictests/suites/base/15_procedure/15_0002_procedure.test b/tests/sqllogictests/suites/base/15_procedure/15_0002_procedure.test index fc7a94df4f62..8bc52f539ab9 100644 --- a/tests/sqllogictests/suites/base/15_procedure/15_0002_procedure.test +++ b/tests/sqllogictests/suites/base/15_procedure/15_0002_procedure.test @@ -19,12 +19,9 @@ call procedure p1(); ---- 2 - statement ok -CREATE PROCEDURE p1(a int, b int) RETURNS int not null LANGUAGE SQL COMMENT='test' AS $$ +CREATE PROCEDURE p1(x UInt8, sum UInt8) RETURNS int not null LANGUAGE SQL COMMENT='test' AS $$ BEGIN - LET x := -1; - LET sum := 0; FOR x IN x TO x + 3 DO sum := sum + x; END FOR; @@ -33,10 +30,8 @@ END; $$; statement error 3131 -CREATE PROCEDURE p1(a int, b int) RETURNS int not null LANGUAGE SQL COMMENT='test' AS $$ +CREATE PROCEDURE p1(x UInt8, sum UInt8) RETURNS int not null LANGUAGE SQL COMMENT='test' AS $$ BEGIN - LET x := 0; - LET sum := 0; FOR x IN x TO x + 3 DO sum := sum + x; END FOR; @@ -49,17 +44,27 @@ call procedure p1(); ---- 2 +query T +call procedure p1(0, 0); +---- +6 + +query T +call procedure p1(1,10); +---- +20 + query T select name, arguments from system.procedures where name = 'p1'; ---- p1 p1() RETURN (Int32) -p1 p1(Int32,Int32) RETURN (Int32) +p1 p1(UInt8,UInt8) RETURN (Int32) statement ok drop procedure p1(); statement ok -drop procedure p1(int, int); +drop procedure p1(UInt8, UInt8); query T select count(name) from system.procedures diff --git a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test index 0759f20c1713..5290f5032c22 100644 --- a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test +++ b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test @@ -53,13 +53,13 @@ SELECT id, score(), content FROM t WHERE match(content, 'the') query IFT SELECT id, score(), content FROM t WHERE match(content, 'fly') ---- -5 2.4594712 Time flies like an arrow; fruit flies like a banana +5 1.0 Time flies like an arrow; fruit flies like a banana query IFT SELECT id, score(), content FROM t WHERE match(content, 'word') ---- -2 1.5948367 A picture is worth a thousand words -4 1.6550698 Actions speak louder than words +2 1.0 A picture is worth a thousand words +4 1.0 Actions speak louder than words query IFT SELECT id, score(), content FROM t WHERE match(content, 'box') @@ -75,12 +75,12 @@ SELECT id, score(), content FROM t WHERE match(content, 'action works', 'fuzzine ---- 2 1.0 A picture is worth a thousand words 3 1.0 The early bird catches the worm -4 2.0 Actions speak louder than words +4 1.0 Actions speak louder than words query IFT SELECT id, score(), content FROM t WHERE match(content, 'action works', 'fuzziness=1;operator=AND') ---- -4 2.0 Actions speak louder than words +4 1.0 Actions speak louder than words statement ok INSERT INTO t VALUES @@ -109,43 +109,43 @@ INSERT INTO t VALUES (30, '张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。') query IFT -SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() +SELECT 
id, score(), content FROM t WHERE match(content, '中国') ORDER BY id ---- -21 1.1111465 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 -24 1.1111465 中国的传统节日蕴含着丰富的文化内涵,是传承和弘扬中华文化的重要途径。 -28 1.2247349 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 -12 1.4482267 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -15 1.5346593 中国的茶文化源远流长,品茶已经成为一种生活方式。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +15 1.0 中国的茶文化源远流长,品茶已经成为一种生活方式。 +21 1.0 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 +24 1.0 中国的传统节日蕴含着丰富的文化内涵,是传承和弘扬中华文化的重要途径。 +28 1.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '北京') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '北京') ORDER BY id ---- -30 1.7396812 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 -12 1.9475443 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +30 1.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '北京大学') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '北京大学') ORDER BY id ---- -30 5.2190437 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 +30 1.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 query IFT SELECT id, score(), content FROM t WHERE match(content, '北京 大', 'fuzziness=1;operator=AND') ORDER BY id ---- -12 2.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -30 2.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +30 1.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '文化博大精深') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '文化博大精深') ORDER BY id ---- -28 7.61753 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 +28 1.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '文化 博大精深') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '文化 博大精深') ORDER BY id ---- -21 1.1111465 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 -24 1.542129 中国的传统节日蕴含着丰富的文化内涵,是传承和弘扬中华文化的重要途径。 -15 2.063777 中国的茶文化源远流长,品茶已经成为一种生活方式。 -28 7.61753 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 +15 1.0 中国的茶文化源远流长,品茶已经成为一种生活方式。 +21 1.0 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 +24 1.0 中国的传统节日蕴含着丰富的文化内涵,是传承和弘扬中华文化的重要途径。 +28 1.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 query IFT SELECT id, score(), content FROM t WHERE match(content, '化博') ORDER BY score() @@ -174,28 +174,28 @@ statement ok UPDATE t SET content = '科技创新是推动社会进步的重要动力,我们应该积极支持和推动科技创新。' WHERE id=24 query IFT -SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY id ---- -21 1.423108 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 -12 1.4482267 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -15 1.5346593 中国的茶文化源远流长,品茶已经成为一种生活方式。 -28 1.5707673 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +15 1.0 中国的茶文化源远流长,品茶已经成为一种生活方式。 +21 1.0 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 +28 1.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '科技') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '科技') ORDER BY id ---- -13 2.1947646 随着科技的发展,人们的生活变得越来越便利。 -24 2.8508463 科技创新是推动社会进步的重要动力,我们应该积极支持和推动科技创新。 +13 1.0 随着科技的发展,人们的生活变得越来越便利。 +24 1.0 科技创新是推动社会进步的重要动力,我们应该积极支持和推动科技创新。 statement ok DELETE FROM t WHERE id=21 query IFT -SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY id ---- -12 1.4482267 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -15 1.5346593 中国的茶文化源远流长,品茶已经成为一种生活方式。 -28 2.002842 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 
+12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +15 1.0 中国的茶文化源远流长,品茶已经成为一种生活方式。 +28 1.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 # index without optional filters and index rocord is basic statement ok @@ -205,12 +205,12 @@ statement ok REFRESH INVERTED INDEX idx1 ON t query IFT -SELECT id, score(), content FROM t WHERE match(content, 'the') +SELECT id, score(), content FROM t WHERE match(content, 'the') ORDER BY id ---- -1 0.8323383 The quick brown fox jumps over the lazy dog -3 0.9893832 The early bird catches the worm -6 0.8788376 Beauty is in the eye of the beholder -10 0.8788376 An apple a day keeps the doctor away +1 1.0 The quick brown fox jumps over the lazy dog +3 1.0 The early bird catches the worm +6 1.0 Beauty is in the eye of the beholder +10 1.0 An apple a day keeps the doctor away query IFT SELECT id, score(), content FROM t WHERE match(content, 'fly') @@ -252,51 +252,51 @@ INSERT INTO books VALUES (20, 'CockroachDB: The Definitive Guide', 'Guy Harrison, Jesse Seldess, Ben Darnell', 'Get the lowdown on CockroachDB, the distributed SQL database built to handle the demands of today’s data-driven cloud applications. In this hands-on guide, software developers, architects, and DevOps/SRE teams will learn how to use CockroachDB to create applications that scale elastically and provide seamless delivery for end users while remaining indestructible. Teams will also learn how to migrate existing applications to CockroachDB’s performant, cloud-native data architecture.') query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY id ---- -2 8.500097 Python深度学习(第2版) -6 6.7982116 Flask Web开发:基于Python的Web应用开发实战(第2版) -14 5.509352 Building Recommendation Systems in Python and JAX -11 5.263399 OpenAI GPT For Python Developers, 2nd Edition -13 4.4659142 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT -12 1.8816761 Developing Apps with GPT-4 and ChatGPT -4 1.5154111 白话深度学习的数学 -3 1.3515654 大模型应用开发极简入门 -7 1.2369337 Apache Pulsar实战 +2 1.0 Python深度学习(第2版) +3 1.0 大模型应用开发极简入门 +4 1.0 白话深度学习的数学 +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +7 1.0 Apache Pulsar实战 +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +12 1.0 Developing Apps with GPT-4 and ChatGPT +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'ChatGPT') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'ChatGPT') ORDER BY id ---- -1 14.471097 这就是ChatGPT -12 10.599274 Developing Apps with GPT-4 and ChatGPT -13 7.9292374 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT -3 1.77537 大模型应用开发极简入门 +1 1.0 这就是ChatGPT +3 1.0 大模型应用开发极简入门 +12 1.0 Developing Apps with GPT-4 and ChatGPT +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计') ORDER BY id ---- -9 14.486509 Vue.js设计与实现 -10 10.238626 前端架构设计 -8 9.061771 Rust程序设计(第2版) -7 3.2078874 Apache Pulsar实战 +7 1.0 Apache Pulsar实战 +8 1.0 Rust程序设计(第2版) +9 1.0 Vue.js设计与实现 +10 1.0 前端架构设计 query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计 实现') ORDER BY 
score() DESC +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计 实现') ORDER BY id ---- -9 32.441788 Vue.js设计与实现 -10 10.238626 前端架构设计 -8 9.061771 Rust程序设计(第2版) -7 5.9086094 Apache Pulsar实战 -4 2.3153453 白话深度学习的数学 +4 1.0 白话深度学习的数学 +7 1.0 Apache Pulsar实战 +8 1.0 Rust程序设计(第2版) +9 1.0 Vue.js设计与实现 +10 1.0 前端架构设计 query IFT -SELECT id, score(), title FROM books WHERE query('title:python') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:python') ORDER BY id ---- -2 1.4378065 Python深度学习(第2版) -14 1.1018704 Building Recommendation Systems in Python and JAX -11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition -6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) -13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +2 1.0 Python深度学习(第2版) +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX query IFT SELECT id, score(), title FROM books WHERE query('title:pyth', 'fuzziness=2') ORDER BY id @@ -308,157 +308,158 @@ SELECT id, score(), title FROM books WHERE query('title:pyth', 'fuzziness=2') OR 14 1.0 Building Recommendation Systems in Python and JAX query IFT -SELECT id, score(), title FROM books WHERE query('title:python OR rust') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:python OR rust') ORDER BY id ---- -17 1.8827661 Rust for Rustaceans -16 1.6531605 Rust Atomics and Locks -8 1.5581512 Rust程序设计(第2版) -2 1.4378065 Python深度学习(第2版) -15 1.3975171 Code Like a Pro in Rust -14 1.1018704 Building Recommendation Systems in Python and JAX -11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition -6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) -13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +2 1.0 Python深度学习(第2版) +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +8 1.0 Rust程序设计(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX +15 1.0 Code Like a Pro in Rust +16 1.0 Rust Atomics and Locks +17 1.0 Rust for Rustaceans query IFT -SELECT id, score(), title FROM books WHERE query('title:python AND rust') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:python AND rust') ORDER BY id ---- query IFT -SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY id ---- -9 5.063791 Vue.js设计与实现 -7 2.189928 Apache Pulsar实战 -5 1.7138567 BERT基础教程:Transformer大模型实战 -6 1.2924166 Flask Web开发:基于Python的Web应用开发实战(第2版) +5 1.0 BERT基础教程:Transformer大模型实战 +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +7 1.0 Apache Pulsar实战 +9 1.0 Vue.js设计与实现 query IFT -SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY id ---- -16 5.0420737 Rust Atomics and Locks +16 1.0 Rust Atomics and Locks query IFT -SELECT id, score(), title FROM books WHERE query('title:"Python深度学习"') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:"Python深度学习"') ORDER BY id ---- -2 6.005718 Python深度学习(第2版) +2 1.0 Python深度学习(第2版) query IFT -SELECT id, score(), title FROM books WHERE query('title:(+python 
-学习)') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:(+python -学习)') ORDER BY id ---- -14 1.1018704 Building Recommendation Systems in Python and JAX -11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition -6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) -13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX query IFT -SELECT id, score(), title FROM books WHERE query('title:+设计 -实现') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:+设计 -实现') ORDER BY id ---- -10 2.0477252 前端架构设计 -8 1.8123543 Rust程序设计(第2版) +8 1.0 Rust程序设计(第2版) +10 1.0 前端架构设计 query IFT -SELECT id, score(), title FROM books WHERE query('title:+设计 实现') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:+设计 实现') ORDER BY id ---- -9 5.063791 Vue.js设计与实现 -10 2.0477252 前端架构设计 -8 1.8123543 Rust程序设计(第2版) +8 1.0 Rust程序设计(第2版) +9 1.0 Vue.js设计与实现 +10 1.0 前端架构设计 query IFT -SELECT id, score(), title FROM books WHERE query('title:python^5 description:chatgpt^2.1') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:python^5 description:chatgpt^2.1') ORDER BY id ---- -13 7.890149 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT -2 7.1890326 Python深度学习(第2版) -14 5.509352 Building Recommendation Systems in Python and JAX -11 5.263399 OpenAI GPT For Python Developers, 2nd Edition -6 4.8319726 Flask Web开发:基于Python的Web应用开发实战(第2版) -1 4.732555 这就是ChatGPT -12 4.325484 Developing Apps with GPT-4 and ChatGPT -3 3.106897 大模型应用开发极简入门 +1 1.0 这就是ChatGPT +2 1.0 Python深度学习(第2版) +3 1.0 大模型应用开发极简入门 +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +12 1.0 Developing Apps with GPT-4 and ChatGPT +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX query IFT -SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY id ---- -9 25.318954 Vue.js设计与实现 -4 22.395063 白话深度学习的数学 -10 10.238626 前端架构设计 -8 9.061771 Rust程序设计(第2版) - +2 1.0 Python深度学习(第2版) +4 1.0 白话深度学习的数学 +7 1.0 Apache Pulsar实战 +8 1.0 Rust程序设计(第2版) +9 1.0 Vue.js设计与实现 +10 1.0 前端架构设计 # index without optional filters and index rocord is basic -statement ok -CREATE OR REPLACE INVERTED INDEX idx2 ON books(title, author, description) tokenizer = 'chinese' index_record='basic' - -statement ok -REFRESH INVERTED INDEX idx2 ON books - -query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY score() DESC ----- -2 8.192706 Python深度学习(第2版) -6 6.235875 Flask Web开发:基于Python的Web应用开发实战(第2版) -14 5.4896193 Building Recommendation Systems in Python and JAX -11 5.2801366 OpenAI GPT For Python Developers, 2nd Edition -13 4.2964296 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT -4 1.5421177 白话深度学习的数学 -3 1.3799851 大模型应用开发极简入门 -12 1.3110648 Developing Apps with GPT-4 and ChatGPT -7 1.2791233 Apache Pulsar实战 - -query IFT -SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY score() DESC ----- -9 5.027091 Vue.js设计与实现 -7 2.2837715 Apache Pulsar实战 
-5 1.7452873 BERT基础教程:Transformer大模型实战 -6 1.2672173 Flask Web开发:基于Python的Web应用开发实战(第2版) +#statement ok +#CREATE OR REPLACE INVERTED INDEX idx2 ON books(title, author, description) tokenizer = 'chinese' index_record='basic' + +#statement ok +#REFRESH INVERTED INDEX idx2 ON books + +#query IFT +#SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY id DESC +#---- +#2 8.192706 Python深度学习(第2版) +#6 6.235875 Flask Web开发:基于Python的Web应用开发实战(第2版) +#14 5.4896193 Building Recommendation Systems in Python and JAX +#11 5.2801366 OpenAI GPT For Python Developers, 2nd Edition +#13 4.2964296 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +#4 1.5421177 白话深度学习的数学 +#3 1.3799851 大模型应用开发极简入门 +#12 1.3110648 Developing Apps with GPT-4 and ChatGPT +#7 1.2791233 Apache Pulsar实战 + +#query IFT +#SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY score() DESC +#---- +#9 5.027091 Vue.js设计与实现 +#7 2.2837715 Apache Pulsar实战 +#5 1.7452873 BERT基础教程:Transformer大模型实战 +#6 1.2672173 Flask Web开发:基于Python的Web应用开发实战(第2版) # basic index record can't search phrase terms -onlyif mysql -statement error 1105 -SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY score() DESC - -onlyif mysql -statement error 1105 -SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC - -statement ok -CREATE TABLE t1 (id int, body json) - -statement ok -CREATE INVERTED INDEX IF NOT EXISTS idx ON t1(body) tokenizer = 'chinese' - -statement ok -INSERT INTO t1 VALUES -(1, '{"title":"The Psychology of Persuasion","metadata":{"author":"Oliver","publishedDate":"2021-06-15","tags":["psychology","persuasion","behavior"]}}'), -(2, '{"title":"Sustainable Energy Solutions","metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]}}'), -(3, '{"title":"The Future of Autonomous Vehicles","metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]}}'), -(4, '{"title":"The Role of AI in Customer Service","metadata":{"author":"Rachel","publishedDate":"2021-09-20","tags":["AI","customer service","automation"]}}'), -(5, '{"title":"Internet of Things Applications","metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]}}'), -(6, '{"title":"人工智能与机器学习","metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]}}'), -(7, '{"title":"区块链在金融行业的应用","metadata":{"author":"李四","publishedDate":"2023-09-18","tags":["区块链","金融行业","金融科技"]}}'), -(8, '{"title":"物联网与智能家居","metadata":{"author":"王五","publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"]}}'), -(9, '{"title":"量子计算的未来","metadata":{"author":"赵六","publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]}}'), -(10, '{"title":"网络安全与隐私保护","metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]}}') - -query IFT -SELECT id, score(), body FROM t1 WHERE query('body.title:energy') ----- -2 3.2352333 {"metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]},"title":"Sustainable Energy Solutions"} - -query IFT -SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:technology') ----- -3 2.4057739 {"metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"} -5 2.4057739 
{"metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]},"title":"Internet of Things Applications"} - -query IFT -SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:技术') ----- -6 2.4057739 {"metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"} -10 2.4057739 {"metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"} - -statement error 1118 -ALTER TABLE t1 DROP COLUMN body +#onlyif mysql +#statement error 1105 +#SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY score() DESC + +#onlyif mysql +#statement error 1105 +#SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC + +#statement ok +#CREATE TABLE t1 (id int, body json) + +#statement ok +#CREATE INVERTED INDEX IF NOT EXISTS idx ON t1(body) tokenizer = 'chinese' + +#statement ok +#INSERT INTO t1 VALUES +#(1, '{"title":"The Psychology of Persuasion","metadata":{"author":"Oliver","publishedDate":"2021-06-15","tags":["psychology","persuasion","behavior"]}}'), +#(2, '{"title":"Sustainable Energy Solutions","metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]}}'), +#(3, '{"title":"The Future of Autonomous Vehicles","metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]}}'), +#(4, '{"title":"The Role of AI in Customer Service","metadata":{"author":"Rachel","publishedDate":"2021-09-20","tags":["AI","customer service","automation"]}}'), +#(5, '{"title":"Internet of Things Applications","metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]}}'), +#(6, '{"title":"人工智能与机器学习","metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]}}'), +#(7, '{"title":"区块链在金融行业的应用","metadata":{"author":"李四","publishedDate":"2023-09-18","tags":["区块链","金融行业","金融科技"]}}'), +#(8, '{"title":"物联网与智能家居","metadata":{"author":"王五","publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"]}}'), +#(9, '{"title":"量子计算的未来","metadata":{"author":"赵六","publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]}}'), +#(10, '{"title":"网络安全与隐私保护","metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]}}') + +#query IFT +#SELECT id, score(), body FROM t1 WHERE query('body.title:energy') +#---- +#2 3.2352333 {"metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]},"title":"Sustainable Energy Solutions"} + +#query IFT +#SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:technology') +#---- +#3 2.4057739 {"metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"} +#5 2.4057739 {"metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]},"title":"Internet of Things Applications"} + +#query IFT +#SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:技术') +#---- +#6 2.4057739 {"metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"} +#10 2.4057739 {"metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"} + +#statement error 1118 +#ALTER TABLE t1 DROP COLUMN body statement error 1118 ALTER TABLE books MODIFY 
COLUMN title int; @@ -466,17 +467,17 @@ ALTER TABLE books MODIFY COLUMN title int; statement ok ALTER TABLE books MODIFY COLUMN title string not null -query TTT -SELECT name, type, definition FROM system.indexes order by name ----- -idx INVERTED t1(body)tokenizer='chinese' -idx1 INVERTED t(content)index_record='"basic"' tokenizer='chinese' -idx2 INVERTED books(title, author, description)index_record='"basic"' tokenizer='chinese' - -query III -select row_count, bloom_filter_size, inverted_index_size from fuse_block('test_index', 't1') ----- -10 465 3534 +#query TTT +#SELECT name, type, definition FROM system.indexes order by name +#---- +#idx INVERTED t1(body)tokenizer='chinese' +#idx1 INVERTED t(content)index_record='"basic"' tokenizer='chinese' +#idx2 INVERTED books(title, author, description)index_record='"basic"' tokenizer='chinese' + +#query III +#select row_count, bloom_filter_size, inverted_index_size from fuse_block('test_index', 't1') +#---- +#10 465 3534 statement ok use default diff --git a/tests/sqllogictests/suites/mode/cluster/create_table.test b/tests/sqllogictests/suites/mode/cluster/create_table.test index ace5b3c2b23d..71a9d6078291 100644 --- a/tests/sqllogictests/suites/mode/cluster/create_table.test +++ b/tests/sqllogictests/suites/mode/cluster/create_table.test @@ -12,33 +12,36 @@ EvalScalar ├── limit: 3 ├── offset: 0 ├── estimated rows: 3.00 - └── Exchange + └── Sort ├── output columns: [max(number) (#6), numbers.number (#4)] - ├── exchange type: Merge - └── Limit - ├── output columns: [max(number) (#6), numbers.number (#4)] - ├── limit: 3 - ├── offset: 0 - ├── estimated rows: 3.00 - └── AggregateFinal - ├── output columns: [max(number) (#6), numbers.number (#4)] - ├── group by: [number] - ├── aggregate functions: [max(number)] - ├── limit: 3 + ├── sort keys: [number ASC NULLS LAST] + ├── estimated rows: 10000000.00 + └── Exchange + ├── output columns: [max(number) (#6), numbers.number (#4), #_order_col] + ├── exchange type: Merge + └── Sort + ├── output columns: [max(number) (#6), numbers.number (#4), #_order_col] + ├── sort keys: [number ASC NULLS LAST] ├── estimated rows: 10000000.00 - └── Exchange + └── AggregateFinal ├── output columns: [max(number) (#6), numbers.number (#4)] - ├── exchange type: Hash(0) - └── AggregatePartial - ├── group by: [number] - ├── aggregate functions: [max(number)] - ├── estimated rows: 10000000.00 - └── TableScan - ├── table: default.system.numbers - ├── output columns: [number (#4)] - ├── read rows: 10000000 - ├── read size: 76.29 MiB - ├── partitions total: 153 - ├── partitions scanned: 153 - ├── push downs: [filters: [], limit: NONE] - └── estimated rows: 10000000.00 + ├── group by: [number] + ├── aggregate functions: [max(number)] + ├── estimated rows: 10000000.00 + └── Exchange + ├── output columns: [max(number) (#6), numbers.number (#4)] + ├── exchange type: Hash(0) + └── AggregatePartial + ├── group by: [number] + ├── aggregate functions: [max(number)] + ├── estimated rows: 10000000.00 + ├── rank limit: 3 + └── TableScan + ├── table: default.system.numbers + ├── output columns: [number (#4)] + ├── read rows: 10000000 + ├── read size: 76.29 MiB + ├── partitions total: 153 + ├── partitions scanned: 153 + ├── push downs: [filters: [], limit: NONE] + └── estimated rows: 10000000.00 diff --git a/tests/sqllogictests/suites/mode/standalone/explain/aggregate.test b/tests/sqllogictests/suites/mode/standalone/explain/aggregate.test index 3ed683ff898e..de694e021e65 100644 --- 
a/tests/sqllogictests/suites/mode/standalone/explain/aggregate.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/aggregate.test @@ -451,6 +451,70 @@ EvalScalar ├── push downs: [filters: [], limit: NONE] └── estimated rows: 0.00 +query T +EXPLAIN SELECT referer, isrefresh, count() FROM t GROUP BY referer, isrefresh order by referer, isrefresh desc limit 10; +---- +Limit +├── output columns: [count() (#2), t.referer (#0), t.isrefresh (#1)] +├── limit: 10 +├── offset: 0 +├── estimated rows: 0.00 +└── Sort + ├── output columns: [count() (#2), t.referer (#0), t.isrefresh (#1)] + ├── sort keys: [referer ASC NULLS LAST, isrefresh DESC NULLS LAST] + ├── estimated rows: 0.00 + └── AggregateFinal + ├── output columns: [count() (#2), t.referer (#0), t.isrefresh (#1)] + ├── group by: [referer, isrefresh] + ├── aggregate functions: [count()] + ├── estimated rows: 0.00 + └── AggregatePartial + ├── group by: [referer, isrefresh] + ├── aggregate functions: [count()] + ├── estimated rows: 0.00 + ├── rank limit: 10 + └── TableScan + ├── table: default.default.t + ├── output columns: [referer (#0), isrefresh (#1)] + ├── read rows: 0 + ├── read size: 0 + ├── partitions total: 0 + ├── partitions scanned: 0 + ├── push downs: [filters: [], limit: NONE] + └── estimated rows: 0.00 + +query T +EXPLAIN SELECT referer, isrefresh, count() FROM t GROUP BY referer, isrefresh limit 3, 10; +---- +Limit +├── output columns: [count() (#2), t.referer (#0), t.isrefresh (#1)] +├── limit: 10 +├── offset: 3 +├── estimated rows: 0.00 +└── Sort + ├── output columns: [count() (#2), t.referer (#0), t.isrefresh (#1)] + ├── sort keys: [referer ASC NULLS LAST, isrefresh ASC NULLS LAST] + ├── estimated rows: 0.00 + └── AggregateFinal + ├── output columns: [count() (#2), t.referer (#0), t.isrefresh (#1)] + ├── group by: [referer, isrefresh] + ├── aggregate functions: [count()] + ├── estimated rows: 0.00 + └── AggregatePartial + ├── group by: [referer, isrefresh] + ├── aggregate functions: [count()] + ├── estimated rows: 0.00 + ├── rank limit: 13 + └── TableScan + ├── table: default.default.t + ├── output columns: [referer (#0), isrefresh (#1)] + ├── read rows: 0 + ├── read size: 0 + ├── partitions total: 0 + ├── partitions scanned: 0 + ├── push downs: [filters: [], limit: NONE] + └── estimated rows: 0.00 + statement ok DROP TABLE IF EXISTS t; diff --git a/tests/sqllogictests/suites/mode/standalone/explain/limit.test b/tests/sqllogictests/suites/mode/standalone/explain/limit.test index 0b8bc375d636..0fe71b4a5d92 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/limit.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/limit.test @@ -94,6 +94,7 @@ Limit ├── group by: [number] ├── aggregate functions: [] ├── estimated rows: 0.20 + ├── rank limit: 3 └── Filter ├── output columns: [t.number (#0)] ├── filters: [is_true(CAST(t.number (#0) AS UInt64 NULL) = if(true, TRY_CAST(scalar_subquery_4 (#4) AS UInt64 NULL), 0))] diff --git a/tests/sqllogictests/suites/mode/standalone/explain/prune_column.test b/tests/sqllogictests/suites/mode/standalone/explain/prune_column.test index 3fe741eb2724..91b532ed9e7d 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/prune_column.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/prune_column.test @@ -54,6 +54,7 @@ Limit ├── group by: [number, number, number, number] ├── aggregate functions: [] ├── estimated rows: 0.00 + ├── rank limit: 1 └── Filter ├── output columns: [numbers.number (#0)] ├── filters: [numbers.number (#0) > 1] @@ -132,12 +133,12 @@ 
HashJoin
│ ├── output columns: [_count_scalar_subquery (#13), _any_scalar_subquery (#14)]
│ ├── group by: []
│ ├── aggregate functions: [count(), any(COUNT(*))]
-│ ├── limit: 1
│ ├── estimated rows: 1.00
│ └── AggregatePartial
│ ├── group by: []
│ ├── aggregate functions: [count(), any(COUNT(*))]
│ ├── estimated rows: 1.00
+│ ├── rank limit: 1
│ └── AggregateFinal
│ ├── output columns: [COUNT(*) (#12)]
│ ├── group by: []
diff --git a/tests/sqllogictests/suites/mode/standalone/explain/subquery.test b/tests/sqllogictests/suites/mode/standalone/explain/subquery.test
index c62b0c4be869..d2d102470f9f 100644
--- a/tests/sqllogictests/suites/mode/standalone/explain/subquery.test
+++ b/tests/sqllogictests/suites/mode/standalone/explain/subquery.test
@@ -179,12 +179,12 @@ HashJoin
│ ├── output columns: [_count_scalar_subquery (#2), _any_scalar_subquery (#3)]
│ ├── group by: []
│ ├── aggregate functions: [count(), any(number)]
-│ ├── limit: 1
│ ├── estimated rows: 1.00
│ └── AggregatePartial
│ ├── group by: []
│ ├── aggregate functions: [count(), any(number)]
│ ├── estimated rows: 1.00
+│ ├── rank limit: 1
│ └── Filter
│ ├── output columns: [numbers.number (#1)]
│ ├── filters: [numbers.number (#1) = 0]
diff --git a/tests/sqllogictests/suites/mode/standalone/explain/window.test b/tests/sqllogictests/suites/mode/standalone/explain/window.test
index 2d853bc8087f..a0601bef3029 100644
--- a/tests/sqllogictests/suites/mode/standalone/explain/window.test
+++ b/tests/sqllogictests/suites/mode/standalone/explain/window.test
@@ -53,13 +53,10 @@ CompoundBlockOperator(Project) × 1
SortPartialTransform × 4
Merge to Resize × 4
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1

# Enable sort spilling

@@ -76,13 +73,10 @@ CompoundBlockOperator(Project) × 1
SortPartialTransform × 4
Merge to Resize × 4
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1


statement ok
@@ -365,13 +359,10 @@ explain pipeline select a, sum(a) over (partition by a order by a desc) from t l
CompoundBlockOperator(Project) × 1
LimitTransform × 1
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1

# Enable sort spilling
statement ok
@@ -384,13 +375,10 @@ explain pipeline select a, sum(a) over (partition by a order by a desc) from t l
CompoundBlockOperator(Project) × 1
LimitTransform × 1
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1


# Disable sort spilling
@@ -404,13 +392,10 @@ explain pipeline select a, dense_rank() over (partition by a order by a desc) fr
CompoundBlockOperator(Project) × 1
LimitTransform × 1
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1

# rows frame single window (can push down limit)
query T
@@ -419,13 +404,10 @@ explain pipeline select a, sum(a) over (partition by a order by a desc rows betw
CompoundBlockOperator(Project) × 1
LimitTransform × 1
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1

# rows frame single window (can not push down limit)
query T
@@ -434,13 +416,10 @@ explain pipeline select a, sum(a) over (partition by a order by a desc rows betw
CompoundBlockOperator(Project) × 1
LimitTransform × 1
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1

# rows frame multi window (can not push down limit)
query T
@@ -455,13 +434,10 @@ CompoundBlockOperator(Project) × 1
SortPartialTransform × 4
Merge to Resize × 4
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1

# row fetch with window function(pipeline explain)
query T
@@ -475,15 +451,12 @@ CompoundBlockOperator(Project) × 1
SortPartialTransform × 4
Merge to Resize × 4
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- TransformFilter × 1
- AddInternalColumnsTransform × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ TransformFilter × 1
+ AddInternalColumnsTransform × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1

# row fetch with window function(plan explain)
query T
@@ -540,13 +513,10 @@ CompoundBlockOperator(Project) × 1
LimitTransform × 1
TransformFilter × 1
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- DeserializeDataTransform × 1
- SyncReadParquetDataSource × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ DeserializeDataTransform × 1
+ SyncReadParquetDataSource × 1

# same order multi window
query T
@@ -571,13 +541,10 @@ explain pipeline select number, lead(number,1, 0) over (partition by number % 3
CompoundBlockOperator(Project) × 1
Transform Window × 1
Transform Window × 1
- TransformWindowPartitionSort × 1
- TransformWindowPartitionSpillReader × 1
- TransformWindowPartitionBucket × 1
- TransformWindowPartitionSpillWriter × 1
- TransformWindowPartitionScatter × 1
- CompoundBlockOperator(Map) × 1
- NumbersSourceTransform × 1
+ TransformWindowPartitionCollect × 1
+ TransformWindowPartitionScatter × 1
+ CompoundBlockOperator(Map) × 1
+ NumbersSourceTransform × 1

# window partition with empty sort items from same window partition with sort items
## explain select a, sum(number - 1) over (partition by number % 3) from (select number, rank()over (partition by number % 3 order by number + 1) a
diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/limit.test b/tests/sqllogictests/suites/mode/standalone/explain_native/limit.test
index d4b71378cce6..651e7a5cae7c 100644
--- a/tests/sqllogictests/suites/mode/standalone/explain_native/limit.test
+++ b/tests/sqllogictests/suites/mode/standalone/explain_native/limit.test
@@ -94,6 +94,7 @@ Limit
├── group by: [number]
├── aggregate functions: []
├── estimated rows: 0.20
+ ├── rank limit: 3
└── Filter
├── output columns: [t.number (#0)]
├── filters: [is_true(CAST(t.number (#0) AS UInt64 NULL) = if(true, TRY_CAST(scalar_subquery_4 (#4) AS UInt64 NULL), 0))]
diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/prune_column.test b/tests/sqllogictests/suites/mode/standalone/explain_native/prune_column.test
index 023c7447b009..7766a562e3dc 100644
--- a/tests/sqllogictests/suites/mode/standalone/explain_native/prune_column.test
+++ b/tests/sqllogictests/suites/mode/standalone/explain_native/prune_column.test
@@ -54,6 +54,7 @@ Limit
├── group by: [number, number, number, number]
├── aggregate functions: []
├── estimated rows: 0.00
+ ├── rank limit: 1
└── Filter
├── output columns: [numbers.number (#0)]
├── filters: [numbers.number (#0) > 1]
@@ -132,12 +133,12 @@ HashJoin
│ ├── output columns: [_count_scalar_subquery (#13), _any_scalar_subquery (#14)]
│ ├── group by: []
│ ├── aggregate functions: [count(), any(COUNT(*))]
-│ ├── limit: 1
│ ├── estimated rows: 1.00
│ └── AggregatePartial
│ ├── group by: []
│ ├── aggregate functions: [count(), any(COUNT(*))]
│ ├── estimated rows: 1.00
+│ ├── rank limit: 1
│ └── AggregateFinal
│ ├── output columns: [COUNT(*) (#12)]
│ ├── group by: []
diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/subquery.test b/tests/sqllogictests/suites/mode/standalone/explain_native/subquery.test
index 66a0c0a68a6a..ae17ac46c8f6 100644
--- a/tests/sqllogictests/suites/mode/standalone/explain_native/subquery.test
+++ b/tests/sqllogictests/suites/mode/standalone/explain_native/subquery.test
@@ -179,12 +179,12 @@ HashJoin
│ ├── output columns: [_count_scalar_subquery (#2), _any_scalar_subquery (#3)]
│ ├── group by: []
│ ├── aggregate functions: [count(), any(number)]
-│ ├── limit: 1
│ ├── estimated rows: 1.00
│ └── AggregatePartial
│ ├── group by: []
│ ├── aggregate functions: [count(), any(number)]
│ ├── estimated rows: 1.00
+│ ├── rank limit: 1
│ └── Filter
│ ├── output columns: [numbers.number (#1)]
│ ├── filters: [numbers.number (#1) = 0]
diff --git a/tests/suites/0_stateless/02_ddl/02_0000_create_drop_view.py b/tests/suites/0_stateless/02_ddl/02_0000_create_drop_view.py
new file mode 100755
index 000000000000..93904e8fb865
--- /dev/null
+++ b/tests/suites/0_stateless/02_ddl/02_0000_create_drop_view.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+import sqlalchemy
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def recreate_view(con):
+    with con.begin() as c:
+        c.execute(sqlalchemy.text("DROP VIEW IF EXISTS v_issue_16188"))
+    with con.begin() as c:
+        c.execute(sqlalchemy.text("CREATE OR REPLACE VIEW v_issue_16188 as select a,b from t_issue_16188"))
+
+def main():
+    tcp_port = os.getenv("QUERY_MYSQL_HANDLER_PORT")
+    if tcp_port is None:
+        port = "3307"
+    else:
+        port = tcp_port
+
+    uri = "mysql+pymysql://root:root@localhost:" + port + "/"
+    con = sqlalchemy.create_engine(uri, future=True)
+    with con.begin() as c:
+        c.execute(sqlalchemy.text("DROP TABLE IF EXISTS t_issue_16188"))
+        c.execute(sqlalchemy.text("CREATE TABLE t_issue_16188 (a int not null, b int not null)"))
+
+    with ThreadPoolExecutor(max_workers=64) as executor:
+        futures = []
+        for _ in range(10):
+            futures.append(executor.submit(recreate_view, con))
+
+        for future in as_completed(futures):
+            future.result()
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/suites/0_stateless/02_ddl/02_0000_create_drop_view.result b/tests/suites/0_stateless/02_ddl/02_0000_create_drop_view.result
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/suites/1_stateful/09_http_handler/09_0007_token.py b/tests/suites/1_stateful/09_http_handler/09_0007_token.py
index 4ed83ac7bc1b..0b6969901fed 100755
--- a/tests/suites/1_stateful/09_http_handler/09_0007_token.py
+++ b/tests/suites/1_stateful/09_http_handler/09_0007_token.py
@@ -53,7 +53,7 @@ def do_logout(_case_id, session_token):


def do_verify(session_token):
-    for token in [session_token, 'xxx']:
+    for token in [session_token, "xxx"]:
        print("---- verify token ", token)
        response = requests.get(
            verify_url,
@@ -62,7 +62,7 @@ def do_verify(session_token):
        print(response.status_code)
        print(response.text)

-    for a in [auth, ('u', 'p')]:
+    for a in [auth, ("u", "p")]:
        print("---- verify password: ", a)
        response = requests.post(
            verify_url,
@@ -116,9 +116,12 @@ def fake_expired_token(ty):
        "nonce": "",
        "sid": "",
    }
-    return "bend-v1-" + ty + '-' + base64.b64encode(
-        json.dumps(expired_claim).encode("utf-8")
-    ).decode("utf-8")
+    return (
+        "bend-v1-"
+        + ty
+        + "-"
+        + base64.b64encode(json.dumps(expired_claim).encode("utf-8")).decode("utf-8")
+    )


def main():
@@ -134,7 +137,6 @@ def main():
    pprint(query_resp.get("session").get("need_sticky"))
    pprint(query_resp.get("session").get("need_refresh"))

-    # cluster
    query_resp = do_query("select count(*) from system.clusters", session_token)
    num_nodes = int(query_resp.get("data")[0][0])

@@ -156,7 +158,7 @@
    # errors
    do_query("select 2", "xxx")
    do_query("select 3", "bend-v1-s-xxx")
-    do_query("select 4", fake_expired_token('s'))
+    do_query("select 4", fake_expired_token("s"))
    do_query("select 5", refresh_token)

    renew_resp = do_refresh(1, refresh_token, session_token)
@@ -174,7 +176,7 @@
    # errors
    do_refresh(2, "xxx", session_token)
    do_refresh(3, "bend-v1-xxx", session_token)
-    do_refresh(4, fake_expired_token('r'), session_token)
+    do_refresh(4, fake_expired_token("r"), session_token)
    do_refresh(5, session_token, session_token)

    # test new_refresh_token works