Parallelize Parquet Serialization (#7562)

* initial implementation * cargo fmt * unbounded channel and flush worker * disable parallelism by default * update configs.md * fix information_schema test
apache · Sep 18, 2023 · 5718a3f · 5718a3f
1 parent f4c4ee1
commit 5718a3f
Show file tree

Hide file tree

Showing 4 changed files with 341 additions and 57 deletions.
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -353,6 +353,15 @@ config_namespace! {
         /// Sets bloom filter number of distinct values. If NULL, uses
         /// default parquet writer setting
         pub bloom_filter_ndv: Option<u64>, default = None
+
+        /// Controls whether DataFusion will attempt to speed up writing
+        /// large parquet files by first writing multiple smaller files
+        /// and then stitching them together into a single large file.
+        /// This will result in faster write speeds, but higher memory usage.
+        /// Bloom filters and column indexes are currently unsupported
+        /// when allow_single_file_parallelism is enabled.
+        pub allow_single_file_parallelism: bool, default = false
+
     }
 }