Skip to content

Commit

Permalink
feat: add ability to query the remote http(s) location directly in da…
Browse files Browse the repository at this point in the history
…tafusion-cli (#9150)

* implement querying the remote http location directly

Signed-off-by: Nikolay Ulmasov <[email protected]>

* fix format

Signed-off-by: Nikolay Ulmasov <[email protected]>

---------

Signed-off-by: Nikolay Ulmasov <[email protected]>
  • Loading branch information
r3stl355 committed Feb 9, 2024
1 parent 701e0dd commit 071dc99
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 4 deletions.
77 changes: 76 additions & 1 deletion datafusion-cli/src/catalog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@ use datafusion::datasource::listing::{
use datafusion::datasource::TableProvider;
use datafusion::error::Result;
use datafusion::execution::context::SessionState;
use object_store::http::HttpBuilder;
use object_store::ObjectStore;
use parking_lot::RwLock;
use std::any::Any;
use std::sync::{Arc, Weak};
use url::Url;

/// Wraps another catalog, automatically creating table providers
/// for local files if needed
Expand Down Expand Up @@ -151,10 +154,35 @@ impl SchemaProvider for DynamicFileSchemaProvider {
// if the inner schema provider didn't have a table by
// that name, try to treat it as a listing table
let state = self.state.upgrade()?.read().clone();
let config = ListingTableConfig::new(ListingTableUrl::parse(name).ok()?)
let table_url = ListingTableUrl::parse(name).ok()?;

// Assure the `http` store for this url is registered if this
// is an `http(s)` listing
// TODO: support for other types, e.g. `s3`, may need to be added
match table_url.scheme() {
"http" | "https" => {
let url: &Url = table_url.as_ref();
match state.runtime_env().object_store_registry.get_store(url) {
Ok(_) => {}
Err(_) => {
let store = Arc::new(
HttpBuilder::new()
.with_url(url.origin().ascii_serialization())
.build()
.ok()?,
) as Arc<dyn ObjectStore>;
state.runtime_env().register_object_store(url, store);
}
}
}
_ => {}
}

let config = ListingTableConfig::new(table_url)
.infer(&state)
.await
.ok()?;

Some(Arc::new(ListingTable::try_new(config).ok()?))
}

Expand All @@ -166,3 +194,50 @@ impl SchemaProvider for DynamicFileSchemaProvider {
self.inner.table_exist(name)
}
}

#[cfg(test)]
mod tests {
use super::*;
use datafusion::prelude::SessionContext;

#[tokio::test]
async fn query_http_location_test() -> Result<()> {
// Perhaps this could be changed to use an existing file but
// that will require a permanently availalble web resource
let domain = "example.com";
let location = format!("http://{domain}/file.parquet");

let mut ctx = SessionContext::new();
ctx.register_catalog_list(Arc::new(DynamicFileCatalog::new(
ctx.state().catalog_list(),
ctx.state_weak_ref(),
)));

let provider =
&DynamicFileCatalog::new(ctx.state().catalog_list(), ctx.state_weak_ref())
as &dyn CatalogProviderList;
let catalog = provider
.catalog(provider.catalog_names().first().unwrap())
.unwrap();
let schema = catalog
.schema(catalog.schema_names().first().unwrap())
.unwrap();
let none = schema.table(&location).await;

// That's a non-existing location so expecting None here
assert!(none.is_none());

// It should still create an object store for the location
let store = ctx
.runtime_env()
.object_store(ListingTableUrl::parse(location)?)?;

assert_eq!(format!("{store}"), "HttpStore");

// The store must be configured for this domain
let expected_domain = format!("Domain(\"{domain}\")");
assert!(format!("{store:?}").contains(&expected_domain));

Ok(())
}
}
19 changes: 16 additions & 3 deletions docs/source/user-guide/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,10 @@ OPTIONS:

## Querying data from the files directly

Files can be queried directly by enclosing the file or
directory name in single `'` quotes as shown in the example.
Files can be queried directly by enclosing the file, directory name
or a remote location in single `'` quotes as shown in the examples.

## Example
## Examples

Create a CSV file to query.

Expand Down Expand Up @@ -194,6 +194,19 @@ DataFusion CLI v16.0.0
2 rows in set. Query took 0.007 seconds.
```
You can also query directly from the remote location via HTTP(S) without
registering the location as a table
```sql
select count(*) from 'https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_1.parquet'
+----------+
| COUNT(*) |
+----------+
| 1000000 |
+----------+
1 row in set. Query took 0.595 seconds.
```
## Creating External Tables
It is also possible to create a table backed by files by explicitly
Expand Down

0 comments on commit 071dc99

Please sign in to comment.