From cd4ec1264db895414f5da3abf26d807079eb2c2b Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Mon, 23 Sep 2024 08:06:29 -0400
Subject: [PATCH] `validate`: added `dynenum` custom keyword for dynamic
validation lookups
implements #1890
---
src/cmd/validate.rs | 192 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 184 insertions(+), 8 deletions(-)
diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs
index e4fc6958e..90c724f56 100644
--- a/src/cmd/validate.rs
+++ b/src/cmd/validate.rs
@@ -20,8 +20,7 @@ It uses the JSON Schema Validation Specification (draft 2020-12) to validate the
It validates not only the structure of the file, but the data types and domain/range of the
fields as well. See https://json-schema.org/draft/2020-12/json-schema-validation.html
-qsv has added support for a custom format - `currency`. This format will only accept a valid
-currency, defined as:
+qsv supports a custom format - `currency`. This format will only accept a valid currency, defined as:
1. ISO Currency Symbol (optional): This is the ISO 4217 three-character code or currency symbol
(e.g. USD, EUR, JPY, $, €, ¥, etc.)
@@ -31,11 +30,17 @@ currency, defined as:
Negative amounts: ($100.00) or -$100.00
Different styles: 1.000,00 (used in some countries for euros)
+qsv also supports a custom keyword - `dynenum`. It allows for dynamic validation against a CSV.
+This is useful for validating against a set of values that is not known at the time of schema creation
+or when the set of valid values is dynamic or too large to hardcode into the schema.
+`dynenum` can be used to validate against a CSV file on the local filesystem or on a URL (http/https).
+Only the first column of the CSV file is read and used for validation.
+
You can create a JSON Schema file from a reference CSV file using the `qsv schema` command.
Once the schema is created, you can fine-tune it to your needs and use it to validate other CSV
files that have the same structure.
-Be sure to select a “training” CSV file that is representative of the data you want to validate
+Be sure to select a "training" CSV file that is representative of the data you want to validate
when creating a schema. The data types, domain/range and regular expressions inferred from the
reference CSV file should be appropriate for the data you want to validate.
@@ -71,7 +76,8 @@ It also confirms if the CSV is UTF-8 encoded.
For both modes, returns exit code 0 when the CSV file is valid, exitcode > 0 otherwise.
If all records are valid, no output files are produced.
-For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_validate.rs.
+For examples, see the tests included in this file (denoted by '#[test]') or see
+https://github.com/jqnatividad/qsv/blob/master/tests/test_validate.rs.
Usage:
qsv validate [options] [] []
@@ -102,8 +108,8 @@ Validate options:
-b, --batch The number of rows per batch to load into memory,
before running in parallel. Set to 0 to load all rows in one batch.
[default: 50000]
- --timeout Timeout for downloading json-schemas on URLs.
- [default: 30]
+ --timeout Timeout for downloading json-schemas on URLs and for
+ 'dynenum' lookups on URLs. [default: 30]
Common options:
-h, --help Display this message
@@ -124,6 +130,7 @@ use std::{
env,
fs::File,
io::{BufReader, BufWriter, Read, Write},
+ iter::once,
str,
sync::{
atomic::{AtomicU16, Ordering},
@@ -131,12 +138,17 @@ use std::{
},
};
+use ahash::{HashSet, HashSetExt};
use csv::ByteRecord;
use indicatif::HumanCount;
#[cfg(any(feature = "feature_capable", feature = "lite"))]
use indicatif::{ProgressBar, ProgressDrawTarget};
use itertools::Itertools;
-use jsonschema::{output::BasicOutput, paths::PathChunk, Validator};
+use jsonschema::{
+ output::BasicOutput,
+ paths::{JsonPointer, JsonPointerNode, PathChunk},
+ ErrorIterator, Keyword, ValidationError, Validator,
+};
use log::{debug, info, log_enabled};
use qsv_currency::Currency;
use rayon::{
@@ -145,10 +157,11 @@ use rayon::{
};
use serde::{Deserialize, Serialize};
use serde_json::{json, value::Number, Map, Value};
+use tempfile::NamedTempFile;
use crate::{
config::{Config, Delimiter, DEFAULT_WTR_BUFFER_CAPACITY},
- util, CliResult,
+ util, CliError, CliResult,
};
// to save on repeated init/allocs
@@ -156,6 +169,21 @@ static NULL_TYPE: OnceLock = OnceLock::new();
static TIMEOUT_SECS: AtomicU16 = AtomicU16::new(30);
+/// write to stderr and log::error, using ValidationError
+macro_rules! fail_validation_error {
+ ($($t:tt)*) => {{
+ use log::error;
+ let err = format!($($t)*);
+ error!("{err}");
+ Err(ValidationError::custom(
+ JsonPointer::default(),
+ JsonPointer::default(),
+ &Value::Null,
+ format!("{}", $($t)*),
+ ))
+ }};
+}
+
#[derive(Deserialize)]
#[allow(dead_code)]
struct Args {
@@ -195,6 +223,12 @@ struct RFC4180Struct {
fields: Vec,
}
+impl From> for CliError {
+ fn from(err: ValidationError) -> CliError {
+ CliError::Other(format!("{err}"))
+ }
+}
+
#[inline]
/// Checks if a given string represents a valid currency format.
fn currency_format_checker(s: &str) -> bool {
@@ -207,6 +241,116 @@ fn currency_format_checker(s: &str) -> bool {
})
}
+#[derive(Debug)]
+struct DynEnumValidator {
+ dynenum_set: HashSet,
+}
+
+impl DynEnumValidator {
+ #[allow(dead_code)]
+ const fn new(dynenum_set: HashSet) -> Self {
+ Self { dynenum_set }
+ }
+}
+
+impl Keyword for DynEnumValidator {
+ fn validate<'instance>(
+ &self,
+ instance: &'instance Value,
+ instance_path: &JsonPointerNode,
+ ) -> ErrorIterator<'instance> {
+ if self.dynenum_set.contains(instance.as_str().unwrap()) {
+ Box::new(std::iter::empty())
+ } else {
+ let error = ValidationError::custom(
+ JsonPointer::default(),
+ instance_path.into(),
+ instance,
+ "Value must be a valid dynamic enum",
+ );
+ Box::new(once(error))
+ }
+ }
+
+ fn is_valid(&self, instance: &Value) -> bool {
+ if let Value::String(s) = instance {
+ self.dynenum_set.contains(s)
+ } else {
+ false
+ }
+ }
+}
+
+#[allow(dead_code)]
+fn dyn_enum_validator_factory<'a>(
+ _parent: &'a Map,
+ value: &'a Value,
+ jsonpointer: JsonPointer,
+) -> Result, ValidationError<'a>> {
+ if let Value::String(uri) = value {
+ let temp_download = NamedTempFile::new()?;
+
+ let dynenum_path = if uri.starts_with("http") {
+ let valid_url = reqwest::Url::parse(uri)?;
+
+ // download the CSV file from the URL
+ let download_timeout = TIMEOUT_SECS.load(Ordering::Relaxed);
+ let future = util::download_file(
+ valid_url.as_str(),
+ temp_download.path().to_path_buf(),
+ false,
+ None,
+ Some(download_timeout),
+ None,
+ );
+ if let Err(e) = tokio::runtime::Runtime::new()?.block_on(future) {
+ return fail_validation_error!("Error downloading dynenum file: {e}");
+ }
+
+ temp_download.path().to_str().unwrap().to_string()
+ } else {
+ // its a local file
+ let uri_path = std::path::Path::new(uri);
+ let uri_exists = uri_path.exists();
+ if !uri_exists {
+ return fail_validation_error!("dynenum file not found: {uri}");
+ }
+ uri_path.to_str().unwrap().to_string()
+ };
+
+ // read the first column into a HashSet
+ let mut enum_set = HashSet::new();
+
+ let rconfig = Config::new(&Some(dynenum_path));
+ let mut rdr = rconfig.reader()?;
+ let mut record = csv::StringRecord::with_capacity(500, 1);
+
+ 'dynenum_loop: loop {
+ let result = rdr.read_record(&mut record);
+ if let Err(e) = result {
+ return fail_validation_error!("Error reading dynenum file: {e}");
+ }
+
+ if result.is_ok_and(|more_data| !more_data) {
+ break 'dynenum_loop;
+ }
+
+ if let Some(value) = record.get(0) {
+ enum_set.insert(value.to_string());
+ }
+ }
+
+ Ok(Box::new(DynEnumValidator::new(enum_set)))
+ } else {
+ Err(ValidationError::custom(
+ JsonPointer::default(),
+ jsonpointer,
+ value,
+ "The 'dynenum' keyword must be set to to a CSV on the local filesystem or on a URL.",
+ ))
+ }
+}
+
pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
@@ -468,6 +612,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
// compile JSON Schema
match Validator::options()
.with_format("currency", currency_format_checker)
+ .with_keyword("dynenum", dyn_enum_validator_factory)
.should_validate_formats(true)
.build(&json)
{
@@ -1183,6 +1328,7 @@ fn test_validate_currency_validator() {
let compiled_schema = Validator::options()
.with_format("currency", currency_format_checker)
+ .with_keyword("dynenum", dyn_enum_validator_factory)
.should_validate_formats(true)
.build(&schema_currency_json())
.expect("Invalid schema");
@@ -1216,6 +1362,7 @@ fn test_validate_currency_validator() {
let compiled_schema = Validator::options()
.with_format("currency", currency_format_checker)
+ .with_keyword("dynenum", dyn_enum_validator_factory)
.should_validate_formats(true)
.build(&schema_currency_json())
.expect("Invalid schema");
@@ -1286,3 +1433,32 @@ fn test_load_json_via_url() {
serde_json::from_str(&json_string_result.unwrap());
assert!(&json_result.is_ok());
}
+
+#[test]
+fn test_dyn_enum_validator() {
+ let schema = json!({"dynenum": "https://raw.githubusercontent.com/jqnatividad/qsv/refs/heads/master/resources/test/fruits.csv", "type": "string"});
+ let validator = jsonschema::options()
+ .with_keyword("dynenum", dyn_enum_validator_factory)
+ .build(&schema)
+ .unwrap();
+
+ assert!(validator.is_valid(&json!("banana")));
+ assert!(validator.is_valid(&json!("strawberry")));
+ assert!(validator.is_valid(&json!("apple")));
+ assert!(!validator.is_valid(&json!("Apple")));
+ assert!(!validator.is_valid(&json!("starapple")));
+ assert!(!validator.is_valid(&json!("bananana")));
+ assert!(!validator.is_valid(&json!("")));
+ assert!(!validator.is_valid(&json!(5)));
+ if let Err(e) = validator.validate(&json!("lanzones")) {
+ let err_info = e.into_iter().next().unwrap();
+ assert_eq!(
+ format!("{err_info:?}"),
+ "ValidationError { instance: String(\"lanzones\"), kind: Custom { message: \"Value \
+ must be a valid dynamic enum\" }, instance_path: JsonPointer([]), schema_path: \
+ JsonPointer([]) }"
+ );
+ } else {
+ unreachable!("Expected an error, but validation succeeded.");
+ };
+}