Commit

Merge commit '8f8e105072ae786c77537ca487f28c292c0cf0e4' into chunchun/update-df-apr-week-4

appletreeisyellow committed Apr 29, 2024
2 parents ab641a0 + 8f8e105 commit dd072a9
Showing 193 changed files with 7,129 additions and 7,322 deletions.
2 changes: 1 addition & 1 deletion .github/actions/setup-windows-builder/action.yaml
@@ -38,7 +38,7 @@ runs:
- name: Setup Rust toolchain
shell: bash
run: |
-# Avoid self update to avoid CI failures: https://github.com/apache/arrow-datafusion/issues/9653
+# Avoid self update to avoid CI failures: https://github.com/apache/datafusion/issues/9653
rustup toolchain install stable --no-self-update
rustup default stable
rustup component add rustfmt
2 changes: 1 addition & 1 deletion .github/workflows/dev_pr.yml
@@ -34,7 +34,7 @@ jobs:
runs-on: ubuntu-latest
# only run for users whose permissions allow them to update PRs
# otherwise labeler is failing:
-# https://github.com/apache/arrow-datafusion/issues/3743
+# https://github.com/apache/datafusion/issues/3743
permissions:
contents: read
pull-requests: write
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
@@ -425,7 +425,7 @@ jobs:
ci/scripts/rust_fmt.sh
# Coverage job disabled due to
-# https://github.com/apache/arrow-datafusion/issues/3678
+# https://github.com/apache/datafusion/issues/3678

# coverage:
# name: coverage
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -46,10 +46,10 @@ resolver = "2"
[workspace.package]
authors = ["Apache Arrow <[email protected]>"]
edition = "2021"
-homepage = "https://github.com/apache/arrow-datafusion"
+homepage = "https://github.com/apache/datafusion"
license = "Apache-2.0"
readme = "README.md"
-repository = "https://github.com/apache/arrow-datafusion"
+repository = "https://github.com/apache/datafusion"
rust-version = "1.73"
version = "37.1.0"

16 changes: 8 additions & 8 deletions README.md
@@ -27,22 +27,22 @@
[crates-badge]: https://img.shields.io/crates/v/datafusion.svg
[crates-url]: https://crates.io/crates/datafusion
[license-badge]: https://img.shields.io/badge/license-Apache%20v2-blue.svg
-[license-url]: https://github.com/apache/arrow-datafusion/blob/main/LICENSE.txt
-[actions-badge]: https://github.com/apache/arrow-datafusion/actions/workflows/rust.yml/badge.svg
-[actions-url]: https://github.com/apache/arrow-datafusion/actions?query=branch%3Amain
+[license-url]: https://github.com/apache/datafusion/blob/main/LICENSE.txt
+[actions-badge]: https://github.com/apache/datafusion/actions/workflows/rust.yml/badge.svg
+[actions-url]: https://github.com/apache/datafusion/actions?query=branch%3Amain
[discord-badge]: https://img.shields.io/discord/885562378132000778.svg?logo=discord&style=flat-square
[discord-url]: https://discord.com/invite/Qw5gKqHxUM

-[Website](https://github.com/apache/arrow-datafusion) |
-[Guides](https://github.com/apache/arrow-datafusion/tree/main/docs) |
+[Website](https://github.com/apache/datafusion) |
+[Guides](https://github.com/apache/datafusion/tree/main/docs) |
[API Docs](https://docs.rs/datafusion/latest/datafusion/) |
[Chat](https://discord.com/channels/885562378132000778/885562378132000781)

<img src="./docs/source/_static/images/2x_bgwhite_original.png" width="512" alt="logo"/>

Apache DataFusion is a very fast, extensible query engine for building high-quality data-centric systems in
[Rust](http://rustlang.org), using the [Apache Arrow](https://arrow.apache.org)
-in-memory format. [Python Bindings](https://github.com/apache/arrow-datafusion-python) are also available. DataFusion offers SQL and Dataframe APIs, excellent [performance](https://benchmark.clickhouse.com/), built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and a great community.
+in-memory format. [Python Bindings](https://github.com/apache/datafusion-python) are also available. DataFusion offers SQL and Dataframe APIs, excellent [performance](https://benchmark.clickhouse.com/), built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and a great community.

Here are links to some important information

@@ -51,7 +51,7 @@ Here are links to some important information
- [Rust Getting Started](https://arrow.apache.org/datafusion/user-guide/example-usage.html)
- [Rust DataFrame API](https://arrow.apache.org/datafusion/user-guide/dataframe.html)
- [Rust API docs](https://docs.rs/datafusion/latest/datafusion)
-- [Rust Examples](https://github.com/apache/arrow-datafusion/tree/master/datafusion-examples)
+- [Rust Examples](https://github.com/apache/datafusion/tree/master/datafusion-examples)
- [Python DataFrame API](https://arrow.apache.org/datafusion-python/)
- [Architecture](https://docs.rs/datafusion/latest/datafusion/index.html#architecture)

@@ -102,4 +102,4 @@ each stable Rust version for 6 months after it is
[released](https://github.com/rust-lang/rust/blob/master/RELEASES.md). This
generally translates to support for the most recent 3 to 4 stable Rust versions.

-We enforce this policy using a [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Farrow-datafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code)
+We enforce this policy using a [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code)
2 changes: 1 addition & 1 deletion benchmarks/src/bin/tpch.rs
@@ -47,7 +47,7 @@ enum TpchOpt {
/// use `dbbench` instead.
///
/// Note: this is kept to be backwards compatible with the benchmark names prior to
-/// <https://github.com/apache/arrow-datafusion/issues/6994>
+/// <https://github.com/apache/datafusion/issues/6994>
#[tokio::main]
async fn main() -> Result<()> {
env_logger::init();
4 changes: 2 additions & 2 deletions clippy.toml
@@ -1,6 +1,6 @@
disallowed-methods = [
{ path = "tokio::task::spawn", reason = "To provide cancel-safety, use `SpawnedTask::spawn` instead (https://github.com/apache/arrow-datafusion/issues/6513)" },
{ path = "tokio::task::spawn_blocking", reason = "To provide cancel-safety, use `SpawnedTask::spawn_blocking` instead (https://github.com/apache/arrow-datafusion/issues/6513)" },
{ path = "tokio::task::spawn", reason = "To provide cancel-safety, use `SpawnedTask::spawn` instead (https://github.com/apache/datafusion/issues/6513)" },
{ path = "tokio::task::spawn_blocking", reason = "To provide cancel-safety, use `SpawnedTask::spawn_blocking` instead (https://github.com/apache/datafusion/issues/6513)" },
]

disallowed-types = [
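
The cancel-safety issue behind this lint: a bare `tokio::task::spawn` keeps running even after its `JoinHandle` is dropped, e.g. when a query is cancelled. A minimal sketch of the abort-on-drop pattern that `SpawnedTask` provides — the `AbortOnDrop` type below is a hypothetical illustration, not DataFusion's actual implementation:

```rust
use tokio::task::{JoinError, JoinHandle};

/// Hypothetical wrapper showing why abort-on-drop gives cancel-safety:
/// dropping the handle aborts the task instead of leaking it.
struct AbortOnDrop<T>(JoinHandle<T>);

impl<T: Send + 'static> AbortOnDrop<T> {
    fn spawn<F>(future: F) -> Self
    where
        F: std::future::Future<Output = T> + Send + 'static,
    {
        Self(tokio::task::spawn(future))
    }

    async fn join(mut self) -> Result<T, JoinError> {
        // Await through a mutable reference (JoinHandle is Unpin), then
        // skip the Drop impl so a finished task is not aborted.
        let result = (&mut self.0).await;
        std::mem::forget(self);
        result
    }
}

impl<T> Drop for AbortOnDrop<T> {
    fn drop(&mut self) {
        self.0.abort();
    }
}

#[tokio::main]
async fn main() {
    let task = AbortOnDrop::spawn(async { 21 * 2 });
    println!("{:?}", task.join().await); // Ok(42)

    let cancelled = AbortOnDrop::spawn(async {
        tokio::time::sleep(std::time::Duration::from_secs(3600)).await;
    });
    drop(cancelled); // aborts the sleeping task rather than leaking it
}
```
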
4 changes: 2 additions & 2 deletions datafusion-cli/Cargo.toml
@@ -23,8 +23,8 @@ authors = ["Apache Arrow <[email protected]>"]
edition = "2021"
keywords = ["arrow", "datafusion", "query", "sql"]
license = "Apache-2.0"
-homepage = "https://github.com/apache/arrow-datafusion"
-repository = "https://github.com/apache/arrow-datafusion"
+homepage = "https://github.com/apache/datafusion"
+repository = "https://github.com/apache/datafusion"
# Specify MSRV here as `cargo msrv` doesn't support workspace version
rust-version = "1.73"
readme = "README.md"
2 changes: 1 addition & 1 deletion datafusion-cli/README.md
@@ -43,4 +43,4 @@ checked in `Cargo.lock` file to ensure reproducible builds.
However, the `datafusion` and sub crates are intended for use as libraries and
thus do not have a `Cargo.lock` file checked in.

-[`datafusion cargo.toml`]: https://github.com/apache/arrow-datafusion/blob/main/Cargo.toml
+[`datafusion cargo.toml`]: https://github.com/apache/datafusion/blob/main/Cargo.toml
2 changes: 1 addition & 1 deletion datafusion-examples/README.md
@@ -30,7 +30,7 @@ Run `git submodule update --init` to init test files.
To run the examples, use the `cargo run` command, such as:

```bash
-git clone https://github.com/apache/arrow-datafusion
+git clone https://github.com/apache/datafusion
cd arrow-datafusion
# Download test data
git submodule update --init
4 changes: 2 additions & 2 deletions datafusion/core/benches/sql_planner.rs
@@ -93,13 +93,13 @@ fn criterion_benchmark(c: &mut Criterion) {
let ctx = create_context();

// Test simplest
-// https://github.com/apache/arrow-datafusion/issues/5157
+// https://github.com/apache/datafusion/issues/5157
c.bench_function("logical_select_one_from_700", |b| {
b.iter(|| logical_plan(&ctx, "SELECT c1 FROM t700"))
});

// Test simplest
-// https://github.com/apache/arrow-datafusion/issues/5157
+// https://github.com/apache/datafusion/issues/5157
c.bench_function("physical_select_one_from_700", |b| {
b.iter(|| physical_plan(&ctx, "SELECT c1 FROM t700"))
});
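
For readers unfamiliar with the harness used in this file: `criterion` benchmarks wrap the measured call in `b.iter`. A minimal, self-contained sketch of the same shape — `plan_stub` is a stand-in for `logical_plan(&ctx, sql)`, which needs a full `SessionContext` to run:

```rust
use criterion::{black_box, criterion_group, criterion_main, Criterion};

// Stand-in for `logical_plan(&ctx, sql)`; any cheap, deterministic
// function is enough to demonstrate the harness shape.
fn plan_stub(sql: &str) -> usize {
    sql.len()
}

fn criterion_benchmark(c: &mut Criterion) {
    // One named benchmark per query; `b.iter` times only the closure body.
    c.bench_function("logical_select_one_from_700", |b| {
        b.iter(|| plan_stub(black_box("SELECT c1 FROM t700")))
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
```
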
4 changes: 2 additions & 2 deletions datafusion/core/src/catalog/mod.rs
@@ -176,8 +176,8 @@ impl CatalogProviderList for MemoryCatalogProviderList {
/// read from Delta Lake tables
///
/// [`datafusion-cli`]: https://arrow.apache.org/datafusion/user-guide/cli.html
-/// [`DynamicFileCatalogProvider`]: https://github.com/apache/arrow-datafusion/blob/31b9b48b08592b7d293f46e75707aad7dadd7cbc/datafusion-cli/src/catalog.rs#L75
-/// [`catalog.rs`]: https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/catalog.rs
+/// [`DynamicFileCatalogProvider`]: https://github.com/apache/datafusion/blob/31b9b48b08592b7d293f46e75707aad7dadd7cbc/datafusion-cli/src/catalog.rs#L75
+/// [`catalog.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/catalog.rs
/// [delta-rs]: https://github.com/delta-io/delta-rs
/// [`UnityCatalogProvider`]: https://github.com/delta-io/delta-rs/blob/951436ecec476ce65b5ed3b58b50fb0846ca7b91/crates/deltalake-core/src/data_catalog/unity/datafusion.rs#L111-L123
///
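
The doc comment above describes plugging custom catalogs (Delta Lake, Unity, dynamic files) into DataFusion. A sketch of registering one programmatically, assuming the in-memory types and module paths from the DataFusion 37 era (`MemoryCatalogProvider`, `MemorySchemaProvider`) — verify against the version you build with:

```rust
use std::sync::Arc;

use datafusion::catalog::schema::MemorySchemaProvider;
use datafusion::catalog::{CatalogProvider, MemoryCatalogProvider};
use datafusion::prelude::*;

fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();

    // Any Arc<dyn CatalogProvider> can be plugged in; the in-memory one
    // stands in for e.g. a Delta Lake or Unity catalog implementation.
    let catalog = MemoryCatalogProvider::new();
    let _ = catalog.register_schema("my_schema", Arc::new(MemorySchemaProvider::new()))?;
    ctx.register_catalog("my_catalog", Arc::new(catalog));

    // Tables registered under it resolve as my_catalog.my_schema.<table>.
    Ok(())
}
```
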
4 changes: 2 additions & 2 deletions datafusion/core/src/dataframe/mod.rs
@@ -2423,7 +2423,7 @@ mod tests {
Ok(())
}

-// Test issue: https://github.com/apache/arrow-datafusion/issues/7790
+// Test issue: https://github.com/apache/datafusion/issues/7790
// The join operation outputs two identical column names, but they belong to different relations.
#[tokio::test]
async fn with_column_join_same_columns() -> Result<()> {
@@ -2503,7 +2503,7 @@ }
}

// Table 't1' self join
-// Supplementary test of issue: https://github.com/apache/arrow-datafusion/issues/7790
+// Supplementary test of issue: https://github.com/apache/datafusion/issues/7790
#[tokio::test]
async fn with_column_self_join() -> Result<()> {
let df = test_table().await?.select_columns(&["c1"])?;
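
A hedged sketch of the scenario those tests cover: a self-join yields two columns with the same name, and `with_column` has to attach a derived column without confusing them. Table and column names below are illustrative, not taken from the tests:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    ctx.sql("CREATE TABLE t1 AS VALUES (1), (2), (3)").await?;

    // Self-join: both sides contribute a column named `column1`,
    // distinguished only by their relation qualifiers `a` and `b`.
    let df = ctx
        .sql("SELECT a.column1, b.column1 FROM t1 a JOIN t1 b ON a.column1 = b.column1")
        .await?;

    // The qualified reference `a.column1` picks one of the two
    // identically named inputs when appending the new column.
    let df = df.with_column("doubled", col("a.column1") * lit(2))?;
    df.show().await?;
    Ok(())
}
```
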
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/cte_worktable.rs
@@ -38,7 +38,7 @@ use crate::execution::context::SessionState;
/// See here for more details: www.postgresql.org/docs/11/queries-with.html#id-1.5.6.12.5.4
pub struct CteWorkTable {
/// The name of the CTE work table
-// WIP, see https://github.com/apache/arrow-datafusion/issues/462
+// WIP, see https://github.com/apache/datafusion/issues/462
#[allow(dead_code)]
name: String,
/// This schema must be shared across both the static and recursive terms of a recursive query
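
For context on what exercises `CteWorkTable`: the work table holds each iteration's rows while a recursive query runs, and is re-read by the recursive term until it produces no new rows. A sketch, assuming recursive CTE support is enabled in the DataFusion version at hand:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // The recursive term reads `nums` from the CTE work table, which is
    // refilled with each iteration's new rows until none are produced.
    let df = ctx
        .sql(
            "WITH RECURSIVE nums AS (
                 SELECT 1 AS n
                 UNION ALL
                 SELECT n + 1 FROM nums WHERE n < 5
             )
             SELECT n FROM nums",
        )
        .await?;
    df.show().await?; // rows n = 1..=5
    Ok(())
}
```
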
4 changes: 2 additions & 2 deletions datafusion/core/src/datasource/file_format/parquet.rs
@@ -212,7 +212,7 @@ impl FileFormat for ParquetFormat {
// object stores (like local file systems) the order returned from list
// is not deterministic. Thus, to ensure deterministic schema inference
// sort the files first.
-// https://github.com/apache/arrow-datafusion/pull/6629
+// https://github.com/apache/datafusion/pull/6629
schemas.sort_by(|(location1, _), (location2, _)| location1.cmp(location2));

let schemas = schemas
@@ -1040,7 +1040,7 @@ pub(crate) mod test_util {
multi_page: bool,
) -> Result<(Vec<ObjectMeta>, Vec<NamedTempFile>)> {
// we need the tmp files to be sorted as some tests rely on the how the returning files are ordered
-// https://github.com/apache/arrow-datafusion/pull/6629
+// https://github.com/apache/datafusion/pull/6629
let tmp_files = {
let mut tmp_files: Vec<_> = (0..batches.len())
.map(|_| NamedTempFile::new().expect("creating temp file"))
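
The comment updated above explains why the file list is sorted before schema merging: object stores do not guarantee listing order, so without a sort the inferred schema could differ between runs. A toy illustration of that sort step (paths and schema labels are made up):

```rust
fn main() {
    // Pretend `list` returned per-file schemas in nondeterministic order.
    let mut schemas = vec![
        ("s3://bucket/part-2.parquet", "b: Int64, a: Utf8"),
        ("s3://bucket/part-1.parquet", "a: Utf8, b: Int64"),
    ];

    // Sorting by object path makes the merge order, and therefore the
    // inferred field order, identical on every run.
    schemas.sort_by(|(loc1, _), (loc2, _)| loc1.cmp(loc2));

    assert_eq!(schemas[0].0, "s3://bucket/part-1.parquet");
    println!("merge order: {:?}", schemas.iter().map(|(l, _)| *l).collect::<Vec<_>>());
}
```
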
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/file_format/write/demux.rs
@@ -57,7 +57,7 @@ type DemuxedStreamReceiver = UnboundedReceiver<(Path, RecordBatchReceiver)>;
/// the demux task for errors and abort accordingly. The single_file_ouput parameter
/// overrides all other settings to force only a single file to be written.
/// partition_by parameter will additionally split the input based on the unique
-/// values of a specific column `<https://github.com/apache/arrow-datafusion/issues/7744>``
+/// values of a specific column `<https://github.com/apache/datafusion/issues/7744>``
/// ┌───────────┐ ┌────────────┐ ┌─────────────┐
/// ┌──────▶ │ batch 1 ├────▶...──────▶│ Batch a │ │ Output File1│
/// │ └───────────┘ └────────────┘ └─────────────┘
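
A toy version of the routing the demux task performs: each input row goes to the bucket for its partition value, with one output file per distinct value. The hive-style partition strings are illustrative; in DataFusion each bucket would be a channel feeding one file writer:

```rust
use std::collections::BTreeMap;

fn main() {
    // (partition value, row payload) pairs, standing in for record batches.
    let rows = [
        ("date=2024-04-29", 1),
        ("date=2024-04-30", 2),
        ("date=2024-04-29", 3),
    ];

    // Route every row to its partition's bucket.
    let mut outputs: BTreeMap<&str, Vec<i32>> = BTreeMap::new();
    for (partition, row) in rows {
        outputs.entry(partition).or_default().push(row);
    }

    assert_eq!(outputs["date=2024-04-29"], vec![1, 3]);
    assert_eq!(outputs.len(), 2); // one output file per distinct partition
}
```
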
10 changes: 0 additions & 10 deletions datafusion/core/src/datasource/listing/helpers.rs
@@ -90,16 +90,6 @@ pub fn expr_applicable_for_cols(col_names: &[String], expr: &Expr) -> bool {

Expr::ScalarFunction(scalar_function) => {
match &scalar_function.func_def {
-ScalarFunctionDefinition::BuiltIn(fun) => {
-match fun.volatility() {
-Volatility::Immutable => Ok(TreeNodeRecursion::Continue),
-// TODO: Stable functions could be `applicable`, but that would require access to the context
-Volatility::Stable | Volatility::Volatile => {
-is_applicable = false;
-Ok(TreeNodeRecursion::Stop)
-}
-}
-}
ScalarFunctionDefinition::UDF(fun) => {
match fun.signature().volatility {
Volatility::Immutable => Ok(TreeNodeRecursion::Continue),
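
The deleted branch mirrors the surviving UDF branch: only `Immutable` functions may take part in partition pruning, because the pruned value must be identical at planning time and execution time. A simplified stand-in for that check, not DataFusion's actual types:

```rust
/// Simplified stand-in for DataFusion's `Volatility` enum.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Volatility {
    Immutable, // same input, same output, in any context (e.g. abs)
    Stable,    // fixed within a single query (e.g. now())
    Volatile,  // may differ on every call (e.g. random())
}

/// An expression is usable for pruning only if every function in it is
/// immutable; stable functions would need the query context to evaluate.
fn usable_for_pruning(v: Volatility) -> bool {
    matches!(v, Volatility::Immutable)
}

fn main() {
    assert!(usable_for_pruning(Volatility::Immutable));
    assert!(!usable_for_pruning(Volatility::Stable));
    assert!(!usable_for_pruning(Volatility::Volatile));
    println!("volatility gate behaves as the deleted branch did");
}
```
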
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/listing/table.rs
@@ -244,7 +244,7 @@ pub struct ListingOptions {
/// the future be automatically determined, for example using
/// parquet metadata.
///
-/// See <https://github.com/apache/arrow-datafusion/issues/4177>
+/// See <https://github.com/apache/datafusion/issues/4177>
/// NOTE: This attribute stores all equivalent orderings (the outer `Vec`)
/// where each ordering consists of an individual lexicographic
/// ordering (encapsulated by a `Vec<Expr>`). If there aren't
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/listing/url.rs
@@ -457,7 +457,7 @@ mod tests {
test("/a/b*.txt", Some(("/a/", "b*.txt")));
test("/a/b/**/c*.txt", Some(("/a/b/", "**/c*.txt")));

-// https://github.com/apache/arrow-datafusion/issues/2465
+// https://github.com/apache/datafusion/issues/2465
test(
"/a/b/c//alltypes_plain*.parquet",
Some(("/a/b/c//", "alltypes_plain*.parquet")),
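
A self-contained approximation of the prefix/glob split these tests pin down: cut at the last `/` before the first glob metacharacter. The helper is illustrative, not DataFusion's implementation, but it reproduces the expectations quoted above:

```rust
/// Split a path into (filesystem prefix, glob pattern): everything up to
/// the last `/` before the first glob metacharacter is the prefix.
fn split_glob(path: &str) -> Option<(&str, &str)> {
    let glob_pos = path.find(|c| matches!(c, '*' | '?' | '['))?;
    let split = path[..glob_pos].rfind('/')? + 1;
    Some((&path[..split], &path[split..]))
}

fn main() {
    assert_eq!(split_glob("/a/b*.txt"), Some(("/a/", "b*.txt")));
    assert_eq!(split_glob("/a/b/**/c*.txt"), Some(("/a/b/", "**/c*.txt")));
    // The doubled slash from issue 2465 is preserved in the prefix.
    assert_eq!(
        split_glob("/a/b/c//alltypes_plain*.parquet"),
        Some(("/a/b/c//", "alltypes_plain*.parquet"))
    );
    println!("glob splits ok");
}
```
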
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/physical_plan/csv.rs
@@ -769,7 +769,7 @@ mod tests {
assert_eq!(14, csv.base_config.file_schema.fields().len());
assert_eq!(14, csv.schema().fields().len());

-// errors due to https://github.com/apache/arrow-datafusion/issues/4918
+// errors due to https://github.com/apache/datafusion/issues/4918
let mut it = csv.execute(0, task_ctx)?;
let err = it.next().await.unwrap().unwrap_err().strip_backtrace();
assert_eq!(
@@ -49,7 +49,7 @@ use super::ParquetFileMetrics;
/// did not filter out that row group.
///
/// Note: This method currently ignores ColumnOrder
-/// <https://github.com/apache/arrow-datafusion/issues/8335>
+/// <https://github.com/apache/datafusion/issues/8335>
pub(crate) fn prune_row_groups_by_statistics(
arrow_schema: &Schema,
parquet_schema: &SchemaDescriptor,
@@ -63,7 +63,7 @@ pub(crate) fn prune_row_groups_by_statistics(
if let Some(range) = &range {
// figure out where the first dictionary page (or first data page are)
// note don't use the location of metadata
-// <https://github.com/apache/arrow-datafusion/issues/5995>
+// <https://github.com/apache/datafusion/issues/5995>
let col = metadata.column(0);
let offset = col
.dictionary_page_offset()
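
A toy version of the statistics-based pruning that `prune_row_groups_by_statistics` performs, reduced to an equality predicate: a row group survives unless its min/max range proves it cannot match. Real pruning handles arbitrary predicates via `PruningPredicate` and must keep groups whose statistics are missing:

```rust
/// Simplified min/max statistics for one column of a row group
/// (a stand-in for what DataFusion reads from Parquet metadata).
struct ColumnStats {
    min: i64,
    max: i64,
}

/// Keep a row group only if `col = target` could match, i.e. target lies
/// within [min, max]. Pruning must never drop a group that might match.
fn might_contain(stats: &ColumnStats, target: i64) -> bool {
    stats.min <= target && target <= stats.max
}

fn main() {
    let groups = [
        ColumnStats { min: 0, max: 99 },
        ColumnStats { min: 100, max: 199 },
    ];
    let target = 150;
    let kept: Vec<usize> = groups
        .iter()
        .enumerate()
        .filter(|(_, s)| might_contain(s, target))
        .map(|(i, _)| i)
        .collect();
    assert_eq!(kept, vec![1]); // only the second row group can match
    println!("row groups kept: {kept:?}");
}
```
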
@@ -360,7 +360,7 @@ mod test {
#[should_panic(
expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Int64, got TimestampNanosecond(NULL, None)"
)]
-// Due to https://github.com/apache/arrow-datafusion/issues/8295
+// Due to https://github.com/apache/datafusion/issues/8295
fn roundtrip_timestamp() {
Test {
input: timestamp_array([
@@ -470,7 +470,7 @@ mod test {
(None, None),
]),
};
-// Due to https://github.com/apache/arrow-datafusion/issues/8334,
+// Due to https://github.com/apache/datafusion/issues/8334,
// statistics for struct arrays are not supported
test.expected_min =
new_null_array(test.input.data_type(), test.expected_min.len());
@@ -483,7 +483,7 @@
#[should_panic(
expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Utf8, got Binary(NULL)"
)]
-// Due to https://github.com/apache/arrow-datafusion/issues/8295
+// Due to https://github.com/apache/datafusion/issues/8295
fn roundtrip_binary() {
Test {
input: Arc::new(BinaryArray::from_opt_vec(vec![
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/view.rs
@@ -158,7 +158,7 @@ mod tests {

#[tokio::test]
async fn issue_3242() -> Result<()> {
-// regression test for https://github.com/apache/arrow-datafusion/pull/3242
+// regression test for https://github.com/apache/datafusion/pull/3242
let session_ctx = SessionContext::new_with_config(
SessionConfig::new().with_information_schema(true),
);
2 changes: 1 addition & 1 deletion datafusion/core/src/execution/context/avro.rs
@@ -65,7 +65,7 @@ mod tests {
use async_trait::async_trait;

// Test for compilation error when calling read_* functions from an #[async_trait] function.
-// See https://github.com/apache/arrow-datafusion/issues/1154
+// See https://github.com/apache/datafusion/issues/1154
#[async_trait]
trait CallReadTrait {
async fn call_read_avro(&self) -> DataFrame;
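
The regression these tests (here and in the csv/parquet contexts below) guard against: calling an async read API from inside an `#[async_trait]` method once failed to compile. A minimal sketch of the pattern — the trait and body are illustrative; the real tests call `ctx.read_avro` and friends:

```rust
use async_trait::async_trait;

#[async_trait]
trait CallRead {
    // Desugars to a boxed future with a Send bound, which is what the
    // original compiler error was about.
    async fn call_read(&self) -> String;
}

struct Caller;

#[async_trait]
impl CallRead for Caller {
    async fn call_read(&self) -> String {
        // Stand-in for `ctx.read_avro(...)`, whose future must satisfy
        // the trait's implicit Send bound.
        async { "dataframe".to_string() }.await
    }
}

#[tokio::main]
async fn main() {
    println!("{}", Caller.call_read().await);
}
```
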
2 changes: 1 addition & 1 deletion datafusion/core/src/execution/context/csv.rs
@@ -127,7 +127,7 @@ mod tests {
}

// Test for compilation error when calling read_* functions from an #[async_trait] function.
-// See https://github.com/apache/arrow-datafusion/issues/1154
+// See https://github.com/apache/datafusion/issues/1154
#[async_trait]
trait CallReadTrait {
async fn call_read_csv(&self) -> DataFrame;
2 changes: 1 addition & 1 deletion datafusion/core/src/execution/context/parquet.rs
@@ -333,7 +333,7 @@ mod tests {
}

// Test for compilation error when calling read_* functions from an #[async_trait] function.
-// See https://github.com/apache/arrow-datafusion/issues/1154
+// See https://github.com/apache/datafusion/issues/1154
#[async_trait]
trait CallReadTrait {
async fn call_read_parquet(&self) -> DataFrame;