Commit c703526: upmerge

2 parents 8e226dc + 870857a
andygrove committed Aug 22, 2023
Showing 51 changed files with 506 additions and 542 deletions.
1 change: 1 addition & 0 deletions benchmarks/Cargo.toml
@@ -35,6 +35,7 @@ snmalloc = ["snmalloc-rs"]
[dependencies]
arrow = { workspace = true }
datafusion = { path = "../datafusion/core", version = "30.0.0" }
+datafusion-common = { path = "../datafusion/common", version = "30.0.0" }
env_logger = "0.10"
futures = "0.3"
log = "^0.4"
7 changes: 3 additions & 4 deletions benchmarks/src/tpch/run.rs
@@ -19,17 +19,16 @@ use super::get_query_sql;
use crate::{BenchmarkRun, CommonOpt};
use arrow::record_batch::RecordBatch;
use arrow::util::pretty::{self, pretty_format_batches};
-use datafusion::datasource::file_format::csv::{CsvFormat, DEFAULT_CSV_EXTENSION};
-use datafusion::datasource::file_format::parquet::{
-ParquetFormat, DEFAULT_PARQUET_EXTENSION,
-};
+use datafusion::datasource::file_format::csv::CsvFormat;
+use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::file_format::FileFormat;
use datafusion::datasource::listing::{
ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
};
use datafusion::datasource::{MemTable, TableProvider};
use datafusion::physical_plan::display::DisplayableExecutionPlan;
use datafusion::physical_plan::{collect, displayable};
+use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};
use log::info;

use std::path::PathBuf;
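Note (not part of the diff): the benchmark keeps using the same two extension constants; only their home moved from the per-format modules in `datafusion::datasource::file_format` to `datafusion_common`. A minimal sketch of the new import, with the constant values taken from the relocated file_type.rs shown later in this commit:

```rust
// Hypothetical snippet, not from the commit: the constant values are
// unchanged, only the crate they are imported from differs.
use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};

fn main() {
    assert_eq!(DEFAULT_CSV_EXTENSION, ".csv");
    assert_eq!(DEFAULT_PARQUET_EXTENSION, ".parquet");
}
```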
9 changes: 9 additions & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion datafusion-examples/examples/csv_opener.rs
@@ -20,7 +20,6 @@ use std::{sync::Arc, vec};
use datafusion::{
assert_batches_eq,
datasource::{
-file_format::file_type::FileCompressionType,
listing::PartitionedFile,
object_store::ObjectStoreUrl,
physical_plan::{CsvConfig, CsvOpener, FileScanConfig, FileStream},
@@ -29,6 +28,7 @@ use datafusion::{
physical_plan::metrics::ExecutionPlanMetricsSet,
test_util::aggr_test_schema,
};
+use datafusion_common::FileCompressionType;
use futures::StreamExt;
use object_store::local::LocalFileSystem;

2 changes: 1 addition & 1 deletion datafusion-examples/examples/csv_sql.rs
@@ -15,9 +15,9 @@
// specific language governing permissions and limitations
// under the License.

-use datafusion::datasource::file_format::file_type::FileCompressionType;
use datafusion::error::Result;
use datafusion::prelude::*;
+use datafusion_common::FileCompressionType;

/// This example demonstrates executing a simple query against an Arrow data source (CSV) and
/// fetching results
2 changes: 1 addition & 1 deletion datafusion-examples/examples/dataframe-to-s3.rs
@@ -15,11 +15,11 @@
// specific language governing permissions and limitations
// under the License.

-use datafusion::datasource::file_format::file_type::{FileType, GetExt};
use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::ListingOptions;
use datafusion::error::Result;
use datafusion::prelude::*;
+use datafusion_common::{FileType, GetExt};

use object_store::aws::AmazonS3Builder;
use std::env;
2 changes: 1 addition & 1 deletion datafusion-examples/examples/json_opener.rs
@@ -21,14 +21,14 @@ use arrow_schema::{DataType, Field, Schema};
use datafusion::{
assert_batches_eq,
datasource::{
-file_format::file_type::FileCompressionType,
listing::PartitionedFile,
object_store::ObjectStoreUrl,
physical_plan::{FileScanConfig, FileStream, JsonOpener},
},
error::Result,
physical_plan::metrics::ExecutionPlanMetricsSet,
};
+use datafusion_common::FileCompressionType;
use futures::StreamExt;
use object_store::ObjectStore;

2 changes: 1 addition & 1 deletion datafusion-examples/examples/parquet_sql_multiple_files.rs
@@ -15,11 +15,11 @@
// specific language governing permissions and limitations
// under the License.

-use datafusion::datasource::file_format::file_type::{FileType, GetExt};
use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::ListingOptions;
use datafusion::error::Result;
use datafusion::prelude::*;
+use datafusion_common::{FileType, GetExt};
use std::sync::Arc;

/// This example demonstrates executing a simple query against an Arrow data source (a directory
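The five example fixes above are all the same one-line migration. A before/after sketch, assuming the crate-root re-exports added later in this commit:

```rust
// Before this commit the types lived in DataFusion core:
//   use datafusion::datasource::file_format::file_type::{FileType, GetExt};
// After it they come from datafusion-common:
use datafusion_common::{FileCompressionType, FileType, GetExt};

fn main() {
    // GetExt is the small extension-getter trait both types implement.
    assert_eq!(FileType::PARQUET.get_ext(), ".parquet");
    assert_eq!(FileCompressionType::GZIP.get_ext(), ".gz");
}
```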
17 changes: 16 additions & 1 deletion datafusion/common/Cargo.toml
@@ -34,19 +34,34 @@ path = "src/lib.rs"

[features]
avro = ["apache-avro"]
-default = []
+compression = ["xz2", "bzip2", "flate2", "zstd", "async-compression"]
+default = ["compression"]
pyarrow = ["pyo3", "arrow/pyarrow"]

[dependencies]
apache-avro = { version = "0.15", default-features = false, features = ["snappy"], optional = true }
arrow = { workspace = true }
arrow-array = { workspace = true }
+async-compression = { version = "0.4.0", features = ["bzip2", "gzip", "xz", "zstd", "futures-io", "tokio"], optional = true }
+bytes = "1.4"
+bzip2 = { version = "0.4.3", optional = true }
chrono = { version = "0.4", default-features = false }
+flate2 = { version = "1.0.24", optional = true }
+futures = "0.3"
num_cpus = "1.13.0"
object_store = { version = "0.6.1", default-features = false, optional = true }
parquet = { workspace = true, optional = true }
pyo3 = { version = "0.19.0", optional = true }
sqlparser = { workspace = true }
+tokio = { version = "1.28", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
+tokio-util = { version = "0.7.4", features = ["io"] }
+xz2 = { version = "0.1", optional = true }
+zstd = { version = "0.12", optional = true }

[dev-dependencies]
rand = "0.8.4"
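Note that `compression` is now a default feature of `datafusion-common`, pulling in tokio, async-compression and the codec crates; builds that want to stay lean must disable default features. An illustrative, self-contained sketch of the cfg pattern the relocated code uses (names here are invented for illustration):

```rust
// Mirrors the `#[cfg(feature = "compression")]` gating in file_type.rs:
// with the feature off, the same call site compiles but reports that
// compression support is unavailable.
#[cfg(feature = "compression")]
fn gzip_support() -> &'static str {
    "gzip streams handled via flate2's MultiGzDecoder"
}

#[cfg(not(feature = "compression"))]
fn gzip_support() -> &'static str {
    "Compression feature is not enabled"
}

fn main() {
    println!("{}", gzip_support());
}
```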
datafusion/{core/src/datasource/file_format → common/src}/file_type.rs
@@ -17,12 +17,6 @@

//! File type abstraction

-use crate::common::{internal_err, not_impl_err};
-use crate::datasource::file_format::arrow::DEFAULT_ARROW_EXTENSION;
-use crate::datasource::file_format::avro::DEFAULT_AVRO_EXTENSION;
-use crate::datasource::file_format::csv::DEFAULT_CSV_EXTENSION;
-use crate::datasource::file_format::json::DEFAULT_JSON_EXTENSION;
-use crate::datasource::file_format::parquet::DEFAULT_PARQUET_EXTENSION;
use crate::error::{DataFusionError, Result};
#[cfg(feature = "compression")]
use async_compression::tokio::bufread::{
@@ -32,19 +26,21 @@ use async_compression::tokio::bufread::{
ZstdDecoder as AsyncZstdDecoer, ZstdEncoder as AsyncZstdEncoder,
};

+use crate::parsers::CompressionTypeVariant;
#[cfg(feature = "compression")]
use async_compression::tokio::write::{BzEncoder, GzipEncoder, XzEncoder, ZstdEncoder};
use bytes::Bytes;
#[cfg(feature = "compression")]
use bzip2::read::MultiBzDecoder;
-use datafusion_common::parsers::CompressionTypeVariant;
#[cfg(feature = "compression")]
use flate2::read::MultiGzDecoder;

+use core::fmt;
use futures::stream::BoxStream;
use futures::StreamExt;
#[cfg(feature = "compression")]
use futures::TryStreamExt;
+use std::fmt::Display;
use std::str::FromStr;
use tokio::io::AsyncWrite;
#[cfg(feature = "compression")]
@@ -55,6 +51,17 @@ use xz2::read::XzDecoder;
use zstd::Decoder as ZstdDecoder;
use CompressionTypeVariant::*;

+/// The default file extension of arrow files
+pub const DEFAULT_ARROW_EXTENSION: &str = ".arrow";
+/// The default file extension of avro files
+pub const DEFAULT_AVRO_EXTENSION: &str = ".avro";
+/// The default file extension of csv files
+pub const DEFAULT_CSV_EXTENSION: &str = ".csv";
+/// The default file extension of json files
+pub const DEFAULT_JSON_EXTENSION: &str = ".json";
+/// The default file extension of parquet files
+pub const DEFAULT_PARQUET_EXTENSION: &str = ".parquet";

/// Define each `FileType`/`FileCompressionType`'s extension
pub trait GetExt {
/// File extension getter
@@ -144,7 +151,7 @@ impl FileCompressionType {
.boxed(),
#[cfg(not(feature = "compression"))]
GZIP | BZIP2 | XZ | ZSTD => {
-return not_impl_err!("Compression feature is not enabled")
+return crate::error::_not_impl_err!("Compression feature is not enabled")
}
UNCOMPRESSED => s.boxed(),
})
@@ -167,7 +174,7 @@
ZSTD => Box::new(ZstdEncoder::new(w)),
#[cfg(not(feature = "compression"))]
GZIP | BZIP2 | XZ | ZSTD => {
-return not_impl_err!("Compression feature is not enabled")
+return crate::error::_not_impl_err!("Compression feature is not enabled")
}
UNCOMPRESSED => w,
})
@@ -197,7 +204,7 @@
.boxed(),
#[cfg(not(feature = "compression"))]
GZIP | BZIP2 | XZ | ZSTD => {
-return not_impl_err!("Compression feature is not enabled")
+return crate::error::_not_impl_err!("Compression feature is not enabled")
}
UNCOMPRESSED => s.boxed(),
})
@@ -222,15 +229,15 @@
},
#[cfg(not(feature = "compression"))]
GZIP | BZIP2 | XZ | ZSTD => {
-return not_impl_err!("Compression feature is not enabled")
+return crate::error::_not_impl_err!("Compression feature is not enabled")
}
UNCOMPRESSED => Box::new(r),
})
}
}

/// Readable file type
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FileType {
/// Apache Arrow file
ARROW,
@@ -256,6 +263,19 @@ impl GetExt for FileType {
}
}

+impl Display for FileType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let out = match self {
+            FileType::CSV => "csv",
+            FileType::JSON => "json",
+            FileType::PARQUET => "parquet",
+            FileType::AVRO => "avro",
+            FileType::ARROW => "arrow",
+        };
+        write!(f, "{}", out)
+    }
+}

impl FromStr for FileType {
type Err = DataFusionError;

@@ -267,7 +287,7 @@
"PARQUET" => Ok(FileType::PARQUET),
"CSV" => Ok(FileType::CSV),
"JSON" | "NDJSON" => Ok(FileType::JSON),
-_ => not_impl_err!("Unknown FileType: {s}"),
+_ => crate::error::_not_impl_err!("Unknown FileType: {s}"),
}
}
}
@@ -281,7 +301,7 @@ impl FileType {
FileType::JSON | FileType::CSV => Ok(format!("{}{}", ext, c.get_ext())),
FileType::PARQUET | FileType::AVRO | FileType::ARROW => match c.variant {
UNCOMPRESSED => Ok(ext),
-_ => internal_err!(
+_ => crate::error::_internal_err!(
"FileCompressionType can be specified for CSV/JSON FileType."
),
},
@@ -291,8 +311,8 @@

#[cfg(test)]
mod tests {
-use crate::datasource::file_format::file_type::{FileCompressionType, FileType};
use crate::error::DataFusionError;
+use crate::file_type::{FileCompressionType, FileType};
use std::str::FromStr;

#[test]
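Taken together, the relocated module keeps its behavior and gains a `Display` impl and a `Hash` derive. A round-trip sketch against the new paths; the assertions follow the match arms visible in the hunks above, so treat it as illustrative:

```rust
use datafusion_common::{FileCompressionType, FileType, Result};
use std::str::FromStr;

fn main() -> Result<()> {
    // FromStr matches on the uppercased name, and Display (new in this
    // commit) prints the lowercase one, so the two round-trip.
    let ft = FileType::from_str("csv")?;
    assert_eq!(ft.to_string(), "csv");

    // The compression suffix is folded into the extension for CSV/JSON...
    assert_eq!(
        FileType::CSV.get_ext_with_compression(FileCompressionType::GZIP)?,
        ".csv.gz"
    );
    // ...while Parquet/Avro/Arrow accept only UNCOMPRESSED.
    assert!(FileType::PARQUET
        .get_ext_with_compression(FileCompressionType::GZIP)
        .is_err());
    Ok(())
}
```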
6 changes: 6 additions & 0 deletions datafusion/common/src/lib.rs
@@ -23,6 +23,7 @@ pub mod delta;
mod dfschema;
pub mod display;
mod error;
+pub mod file_type;
pub mod format;
mod functional_dependencies;
mod join_type;
@@ -44,6 +45,11 @@ pub use error::{
field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError,
SharedResult,
};
+pub use file_type::{
+    FileCompressionType, FileType, GetExt, DEFAULT_ARROW_EXTENSION,
+    DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION, DEFAULT_JSON_EXTENSION,
+    DEFAULT_PARQUET_EXTENSION,
+};
pub use functional_dependencies::{
aggregate_functional_dependencies, get_target_functional_dependencies, Constraints,
Dependency, FunctionalDependence, FunctionalDependencies,
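With the module public and the common names re-exported, both spellings resolve; a small hypothetical check:

```rust
use datafusion_common::file_type::DEFAULT_JSON_EXTENSION; // module path
use datafusion_common::DEFAULT_CSV_EXTENSION; // crate-root re-export

fn main() {
    assert_eq!(DEFAULT_JSON_EXTENSION, ".json");
    assert_eq!(DEFAULT_CSV_EXTENSION, ".csv");
}
```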
9 changes: 4 additions & 5 deletions datafusion/core/src/dataframe.rs
@@ -24,8 +24,7 @@ use arrow::array::{Array, ArrayRef, Int64Array, StringArray};
use arrow::compute::{cast, concat};
use arrow::datatypes::{DataType, Field};
use async_trait::async_trait;
-use datafusion_common::{DataFusionError, SchemaError, UnnestOptions};
-use datafusion_expr::dml::OutputFileFormat;
+use datafusion_common::{DataFusionError, FileType, SchemaError, UnnestOptions};
use parquet::file::properties::WriterProperties;

use datafusion_common::{Column, DFSchema, ScalarValue};
@@ -999,7 +998,7 @@ impl DataFrame {
let plan = LogicalPlanBuilder::copy_to(
self.plan,
path.into(),
-OutputFileFormat::CSV,
+FileType::CSV,
true,
// TODO implement options
vec![],
@@ -1017,7 +1016,7 @@
let plan = LogicalPlanBuilder::copy_to(
self.plan,
path.into(),
-OutputFileFormat::PARQUET,
+FileType::PARQUET,
true,
// TODO implement options
vec![],
@@ -1034,7 +1033,7 @@
let plan = LogicalPlanBuilder::copy_to(
self.plan,
path.into(),
-OutputFileFormat::JSON,
+FileType::JSON,
true,
// TODO implement options
vec![],
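At the user level the swap from `OutputFileFormat` to `FileType` is invisible: the `DataFrame` writers keep their signatures and just tag the COPY TO plan differently. A hypothetical end-to-end use (file paths are illustrative):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    let df = ctx.read_csv("input.csv", CsvReadOptions::new()).await?;
    // Internally this now builds LogicalPlanBuilder::copy_to(..,
    // FileType::CSV, ..) as shown in the hunk above.
    df.write_csv("output_dir").await?;
    Ok(())
}
```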
2 changes: 0 additions & 2 deletions datafusion/core/src/datasource/file_format/arrow.rs
@@ -34,8 +34,6 @@ use std::any::Any;
use std::io::{Read, Seek};
use std::sync::Arc;

-/// The default file extension of arrow files
-pub const DEFAULT_ARROW_EXTENSION: &str = ".arrow";
/// Arrow `FileFormat` implementation.
#[derive(Default, Debug)]
pub struct ArrowFormat;
2 changes: 0 additions & 2 deletions datafusion/core/src/datasource/file_format/avro.rs
@@ -34,8 +34,6 @@ use crate::execution::context::SessionState;
use crate::physical_plan::ExecutionPlan;
use crate::physical_plan::Statistics;

-/// The default file extension of avro files
-pub const DEFAULT_AVRO_EXTENSION: &str = ".avro";
/// Avro `FileFormat` implementation.
#[derive(Default, Debug)]
pub struct AvroFormat;