From e845a3cc1dcbbceda86bb7fe132c5040d23ce78b Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Fri, 13 Sep 2024 08:35:48 -0400
Subject: [PATCH] `sample`: add --max-size option
to support partial sampling of CSV data on remote URLs
---
README.md | 2 +-
src/cmd/sample.rs | 19 +++++++++++++------
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index 845a05af6..76cb668aa 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@
| [replace](/src/cmd/replace.rs#L2)
👆 | Replace CSV data using a regex. Applies the regex to each field individually. |
| [reverse](/src/cmd/reverse.rs#L2)
📇🤯 | Reverse order of rows in a CSV. Unlike the `sort --reverse` command, it preserves the order of rows with the same key. If an index is present, it works with constant memory. Otherwise, it will load all the data into memory. |
| [safenames](/src/cmd/safenames.rs#L2)
![CKAN](docs/images/ckan.png) | Modify headers of a CSV to only have ["safe" names](/src/cmd/safenames.rs#L5-L14) - guaranteed "database-ready"/"CKAN-ready" names. |
-| [sample](/src/cmd/sample.rs#L2)
📇🌐🏎️ | Randomly draw rows (with optional seed) from a CSV using [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling), using memory proportional to the sample size. If an index is present, using random indexing with constant memory. |
+| [sample](/src/cmd/sample.rs#L2)
📇🌐🏎️ | Randomly draw rows (with optional seed) from a CSV using [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling), using memory proportional to the sample size. If an index is present, uses random indexing with constant memory. Supports sampling from CSVs on remote URLs. |
| [schema](/src/cmd/schema.rs#L2)
📇😣🏎️👆🪄 | Infer schema from CSV data, replete with data type & domain/range validation & output in [JSON Schema](https://json-schema.org/) format. Uses multithreading to go faster if an index is present. See `validate` command to use the generated JSON Schema to validate if similar CSVs comply with the schema. |
| [search](/src/cmd/search.rs#L2)
👆 | Run a regex over a CSV. Applies the regex to selected fields & shows only matching rows. |
| [searchset](/src/cmd/searchset.rs#L2)
👆 | _Run multiple regexes over a CSV in a single pass._ Applies the regexes to each field individually & shows only matching rows. |
diff --git a/src/cmd/sample.rs b/src/cmd/sample.rs
index 77efaff69..a69988550 100644
--- a/src/cmd/sample.rs
+++ b/src/cmd/sample.rs
@@ -10,6 +10,8 @@ which is necessary to provide a uniform random sample (reservoir sampling).
If you wish to limit the number of records visited, use the 'qsv slice' command
to pipe into 'qsv sample'.
+Also supports sampling from CSVs on remote URLs.
+
This command is intended to provide a means to sample from a CSV data set that
is too big to fit into memory (for example, for use with commands like
'qsv stats' with the '--everything' option).
@@ -45,6 +47,7 @@ sample options:
2.1 GB/s throughput though slow initialization.
[default: standard]
+ REMOTE FILE OPTIONS:
--user-agent Specify custom user agent to use when the input is a URL.
It supports the following variables -
$QSV_VERSION, $QSV_TARGET, $QSV_BIN_NAME, $QSV_KIND and $QSV_COMMAND.
@@ -52,6 +55,10 @@ sample options:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
--timeout Timeout for downloading URLs in seconds.
[default: 30]
+ --max-size Maximum size of the file to download in MB before sampling.
+ Will download the entire file if not specified.
+ If the CSV is partially downloaded, the sample will be taken
+ only from the downloaded portion.
Common options:
-h, --help Display this message
@@ -90,6 +97,7 @@ struct Args {
flag_rng: String,
flag_user_agent: Option,
flag_timeout: Option,
+ flag_max_size: Option,
}
#[derive(Debug, EnumString, PartialEq)]
@@ -103,10 +111,6 @@ enum RngKind {
pub fn run(argv: &[&str]) -> CliResult<()> {
let mut args: Args = util::get_args(USAGE, argv)?;
- if args.arg_sample_size.is_sign_negative() {
- return fail_incorrectusage_clierror!("Sample size cannot be negative.");
- }
-
let Ok(rng_kind) = RngKind::from_str(&args.flag_rng) else {
return fail_incorrectusage_clierror!(
"Invalid RNG algorithm `{}`. Supported RNGs are: standard, faster, cryptosecure.",
@@ -119,6 +123,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
args.arg_input = match args.arg_input {
Some(uri) => {
if Url::parse(&uri).is_ok() && uri.starts_with("http") {
+ let max_size_bytes = args.flag_max_size.map(|mb| mb * 1024 * 1024);
+
// its a remote file, download it first
let future = util::download_file(
&uri,
@@ -126,7 +132,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
false,
args.flag_user_agent,
args.flag_timeout,
- None,
+ max_size_bytes,
);
tokio::runtime::Runtime::new()?.block_on(future)?;
// safety: temp_download is a NamedTempFile, so we know it can be converted to a
@@ -143,7 +149,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let rconfig = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
- .no_headers(args.flag_no_headers);
+ .no_headers(args.flag_no_headers)
+ .flexible(true);
let mut sample_size = args.arg_sample_size;