diff --git a/Cargo.lock b/Cargo.lock index 2db37714e..38a206c69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -781,6 +781,12 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base62" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f879ef8fc74665ed7f0e6127cb106315888fc2744f68e14b74f83edbb2a08992" + [[package]] name = "base64" version = "0.22.1" @@ -5047,6 +5053,7 @@ dependencies = [ "arboard", "assert-json-diff", "atoi_simd 0.16.0", + "base62", "bincode", "byteorder", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 168ea7a43..93db93d14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,6 +72,7 @@ ahash = "0.8" anyhow = { version = "1.0", optional = true } arboard = "3.4.0" atoi_simd = "0.16" +base62 = { version = "2.0", optional = true } bincode = "1.3" byteorder = "1.5" bytes = "1" @@ -279,6 +280,7 @@ distrib_features = [ ] all_features = ["distrib_features", "self_update"] apply = [ + "base62", "censor", "cpc", "data-encoding", diff --git a/src/cmd/apply.rs b/src/cmd/apply.rs index a87aac68e..3505998b8 100644 --- a/src/cmd/apply.rs +++ b/src/cmd/apply.rs @@ -28,7 +28,7 @@ number of transformed columns with the --rename option is the same. 
e.g.: $ qsv apply operations trim,upper col1,col2,col3 -r newcol1,newcol2,newcol3 file.csv -It has 36 supported operations: +It has 38 supported operations: * len: Return string length * lower: Transform to lowercase @@ -44,6 +44,8 @@ It has 36 supported operations: * strip_prefix: Removes specified prefix in --comparand * strip_suffix: Remove specified suffix in --comparand * escape - escape (Rust escape_default) + * encode62: base62 encode + * decode62: base62 decode * encode64: base64 encode * decode64: base64 decode * replace: Replace all matches of a pattern (using --comparand) @@ -128,10 +130,9 @@ Convert the USD_Price to PHP_Price using the currency symbol "PHP" with a conver $ qsv apply operations numtocurrency USD_Price -C PHP -R 60 -c PHP_Price file.csv -Base64 encode the plaintext_col column and save the encoded value into new column named encoded_col -and then decode it. +Base64 encode the text_col column & save the encoded value into new column named encoded & decode it. - $ qsv apply operations encode plaintext_col -c encoded_col file.csv | qsv apply operations decode encode_col + $ qsv apply operations encode64 text_col -c encoded file.csv | qsv apply operations decode64 encoded Compute the Normalized Damerau-Levenshtein similarity of the neighborhood column to the string 'Roxbury' and save it to a new column named dln_roxbury_score. 
@@ -299,6 +300,7 @@ Common options: use std::{str::FromStr, sync::OnceLock}; +use base62; use censor::{Censor, Sex, Zealous}; use cpc::{eval, units::Unit}; use data_encoding::BASE64; @@ -344,8 +346,10 @@ enum Operations { Censor_Count, Copy, Currencytonum, - Decode, - Encode, + Decode62, + Decode64, + Encode62, + Encode64, Escape, Eudex, Gender_Guess, @@ -998,16 +1002,28 @@ fn apply_operations( Operations::Mrtrim => { *cell = String::from(cell.trim_end_matches(comparand)); }, - Operations::Encode => { + Operations::Encode64 => { *cell = BASE64.encode(cell.as_bytes()); }, - Operations::Decode => { + Operations::Decode64 => { let mut output = vec![0; BASE64.decode_len(cell.len()).unwrap_or_default()]; *cell = match BASE64.decode_mut(cell.as_bytes(), &mut output) { Ok(len) => simdutf8::basic::from_utf8(&output[0..len]) .unwrap_or_default() .to_owned(), - Err(e) => format!("decoding error: {e:?}"), + Err(e) => format!("decoding64 error: {e:?}"), + }; + }, + Operations::Encode62 => { + *cell = match cell.parse::<u128>() { + Ok(num) => base62::encode(num), + Err(e) => format!("encode62 error: {e:?}"), + }; + }, + Operations::Decode62 => { + *cell = match base62::decode(cell.as_str()) { + Ok(decoded) => decoded.to_string(), + Err(e) => format!("decode62 error: {e:?}"), }; }, Operations::Gender_Guess => { diff --git a/tests/test_apply.rs b/tests/test_apply.rs index f537c8150..795bfd77c 100644 --- a/tests/test_apply.rs +++ b/tests/test_apply.rs @@ -225,7 +225,84 @@ fn apply_ops_upper_index_params() { } #[test] -fn apply_ops_encode() { +fn apply_ops_encode62() { + let wrk = Workdir::new("apply"); + wrk.create( + "data.csv", + vec![ + svec!["hash", "encoded_hash"], + svec!["3824660653605227303", "4YWz9bdwJXT"], + svec!["1851770582521928574", "2Cn7zXlsIM2"], + svec!["7916590694040213670", "9Qo4IeFuLyQ"], + svec!["10903434754618017012", "CzRqVIpxoUG"], + svec!["7671262618974725285", "98gSmJMD1XB"], + ], + ); + let mut cmd = wrk.command("apply"); + cmd.arg("operations") +
.arg("decode62") + .arg("2") + .arg("--new-column") + .arg("decoded_hash") + .arg("data.csv"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["hash", "encoded_hash", "decoded_hash"], + svec!["3824660653605227303", "4YWz9bdwJXT", "3824660653605227303"], + svec!["1851770582521928574", "2Cn7zXlsIM2", "1851770582521928574"], + svec!["7916590694040213670", "9Qo4IeFuLyQ", "7916590694040213670"], + svec![ + "10903434754618017012", + "CzRqVIpxoUG", + "10903434754618017012" + ], + svec!["7671262618974725285", "98gSmJMD1XB", "7671262618974725285"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn apply_ops_decode62() { + let wrk = Workdir::new("apply"); + wrk.create( + "data.csv", + vec![ + svec!["hash"], + svec!["3824660653605227303"], + svec!["1851770582521928574"], + svec!["7916590694040213670"], + svec!["10903434754618017012"], + svec!["7671262618974725285"], + svec!["this should cause an error"], + ], + ); + let mut cmd = wrk.command("apply"); + cmd.arg("operations") + .arg("encode62") + .arg("1") + .arg("--new-column") + .arg("encoded_hash") + .arg("data.csv"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["hash", "encoded_hash"], + svec!["3824660653605227303", "4YWz9bdwJXT"], + svec!["1851770582521928574", "2Cn7zXlsIM2"], + svec!["7916590694040213670", "9Qo4IeFuLyQ"], + svec!["10903434754618017012", "CzRqVIpxoUG"], + svec!["7671262618974725285", "98gSmJMD1XB"], + svec![ + "this should cause an error", + "encode62 error: ParseIntError { kind: InvalidDigit }" + ], + ]; + assert_eq!(got, expected); +} + +#[test] +fn apply_ops_encode64() { let wrk = Workdir::new("apply"); wrk.create( "data.csv", @@ -252,7 +329,7 @@ fn apply_ops_encode() { ); let mut cmd = wrk.command("apply"); cmd.arg("operations") - .arg("encode") + .arg("encode64") .arg("surname") .arg("--new-column") .arg("encoded_surname") @@ -277,7 +354,7 @@ fn apply_ops_encode() { } #[test] -fn apply_ops_decode() { +fn apply_ops_decode64() { let wrk =
Workdir::new("apply"); wrk.create( "data.csv", @@ -297,7 +374,7 @@ fn apply_ops_decode() { ); let mut cmd = wrk.command("apply"); cmd.arg("operations") - .arg("decode") + .arg("decode64") .arg("encoded_surname") .arg("--new-column") .arg("decoded_surname")