From e845a3cc1dcbbceda86bb7fe132c5040d23ce78b Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 13 Sep 2024 08:35:48 -0400 Subject: [PATCH] `sample`: add --max-size option to support partial sampling of CSV data on remote URLs --- README.md | 2 +- src/cmd/sample.rs | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 845a05af6..76cb668aa 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ | [replace](/src/cmd/replace.rs#L2)
👆 | Replace CSV data using a regex. Applies the regex to each field individually. | | [reverse](/src/cmd/reverse.rs#L2)
📇🤯 | Reverse order of rows in a CSV. Unlike the `sort --reverse` command, it preserves the order of rows with the same key. If an index is present, it works with constant memory. Otherwise, it will load all the data into memory. | | [safenames](/src/cmd/safenames.rs#L2)
![CKAN](docs/images/ckan.png) | Modify headers of a CSV to only have ["safe" names](/src/cmd/safenames.rs#L5-L14) - guaranteed "database-ready"/"CKAN-ready" names. | -| [sample](/src/cmd/sample.rs#L2)
📇🌐🏎️ | Randomly draw rows (with optional seed) from a CSV using [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling), using memory proportional to the sample size. If an index is present, using random indexing with constant memory. | +| [sample](/src/cmd/sample.rs#L2)
📇🌐🏎️ | Randomly draw rows (with optional seed) from a CSV using [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling), using memory proportional to the sample size. If an index is present, it uses random indexing with constant memory. Supports sampling from CSVs on remote URLs. | | [schema](/src/cmd/schema.rs#L2)
📇😣🏎️👆🪄 | Infer schema from CSV data, replete with data type & domain/range validation & output in [JSON Schema](https://json-schema.org/) format. Uses multithreading to go faster if an index is present. See `validate` command to use the generated JSON Schema to validate if similar CSVs comply with the schema. | | [search](/src/cmd/search.rs#L2)
👆 | Run a regex over a CSV. Applies the regex to selected fields & shows only matching rows. | | [searchset](/src/cmd/searchset.rs#L2)
👆 | _Run multiple regexes over a CSV in a single pass._ Applies the regexes to each field individually & shows only matching rows. | diff --git a/src/cmd/sample.rs b/src/cmd/sample.rs index 77efaff69..a69988550 100644 --- a/src/cmd/sample.rs +++ b/src/cmd/sample.rs @@ -10,6 +10,8 @@ which is necessary to provide a uniform random sample (reservoir sampling). If you wish to limit the number of records visited, use the 'qsv slice' command to pipe into 'qsv sample'. +Also supports sampling from CSVs on remote URLs. + This command is intended to provide a means to sample from a CSV data set that is too big to fit into memory (for example, for use with commands like 'qsv stats' with the '--everything' option). @@ -45,6 +47,7 @@ sample options: 2.1 GB/s throughput though slow initialization. [default: standard] + REMOTE FILE OPTIONS: --user-agent <agent> Specify custom user agent to use when the input is a URL. It supports the following variables - $QSV_VERSION, $QSV_TARGET, $QSV_BIN_NAME, $QSV_KIND and $QSV_COMMAND. @@ -52,6 +55,10 @@ sample options: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent --timeout <secs> Timeout for downloading URLs in seconds. [default: 30] + --max-size <mb> Maximum size of the file to download in MB before sampling. + Will download the entire file if not specified. + If the CSV is partially downloaded, the sample will be taken + only from the downloaded portion. 
Common options: -h, --help Display this message @@ -90,6 +97,7 @@ struct Args { flag_rng: String, flag_user_agent: Option<String>, flag_timeout: Option<u16>, + flag_max_size: Option<u64>, } #[derive(Debug, EnumString, PartialEq)] @@ -103,10 +111,6 @@ enum RngKind { pub fn run(argv: &[&str]) -> CliResult<()> { let mut args: Args = util::get_args(USAGE, argv)?; - if args.arg_sample_size.is_sign_negative() { - return fail_incorrectusage_clierror!("Sample size cannot be negative."); - } - let Ok(rng_kind) = RngKind::from_str(&args.flag_rng) else { return fail_incorrectusage_clierror!( "Invalid RNG algorithm `{}`. Supported RNGs are: standard, faster, cryptosecure.", @@ -119,6 +123,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.arg_input = match args.arg_input { Some(uri) => { if Url::parse(&uri).is_ok() && uri.starts_with("http") { + let max_size_bytes = args.flag_max_size.map(|mb| mb * 1024 * 1024); + // its a remote file, download it first let future = util::download_file( &uri, @@ -126,7 +132,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { false, args.flag_user_agent, args.flag_timeout, - None, + max_size_bytes, ); tokio::runtime::Runtime::new()?.block_on(future)?; // safety: temp_download is a NamedTempFile, so we know it can be converted to a @@ -143,7 +149,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let rconfig = Config::new(&args.arg_input) .delimiter(args.flag_delimiter) - .no_headers(args.flag_no_headers); + .no_headers(args.flag_no_headers) + .flexible(true); let mut sample_size = args.arg_sample_size;