feat: make chunk size user-defined
BREAKING CHANGE: all APIs updated to take min/max chunk sizes passed in
by the user.

This allows callers to vary chunking to suit their use of the lib.
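As an illustration of the new call shape, a minimal round-trip sketch (the chunk bounds are caller-defined constants mirroring the ones this commit adds to the benches and example; `decrypt_full_set` returning the original bytes is assumed from the bench code below):

```rust
use bytes::Bytes;
use self_encryption::{decrypt_full_set, encrypt};

/// Chunk bounds are now chosen by the caller (these mirror the constants
/// the commit adds to benches/lib.rs and examples/basic_encryptor.rs).
const MAX_CHUNK_SIZE: usize = 1024 * 1024;
const MIN_CHUNK_SIZE: usize = 1;

fn main() {
    let data = Bytes::from(vec![7u8; 10 * 1024 * 1024]);
    // Breaking change: `encrypt` now takes the min/max chunk size explicitly.
    let (data_map, encrypted_chunks) =
        encrypt(data.clone(), MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap();
    // The decrypt side is unchanged by this commit.
    let raw = decrypt_full_set(&data_map, &encrypted_chunks).unwrap();
    assert_eq!(data, raw);
}
```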
joshuef committed Sep 25, 2024
1 parent 7a113a0 commit 2b68ba1
Showing 7 changed files with 223 additions and 104 deletions.
benches/lib.rs (10 changes: 8 additions & 2 deletions)

```diff
@@ -51,6 +51,11 @@ use std::time::Duration;
 // https://bheisler.github.io/criterion.rs/book/analysis.html#measurement
 const SAMPLE_SIZE: usize = 20;
 
+/// The maximum size (before compression) of an individual chunk of a file, defined as 1024kiB.
+const MAX_CHUNK_SIZE: usize = 1024 * 1024;
+/// The minimum size (before compression) of an individual chunk of a file, defined as 1B.
+const MIN_CHUNK_SIZE: usize = 1;
+
 fn custom_criterion() -> Criterion {
     Criterion::default()
         .measurement_time(Duration::from_secs(40))
@@ -63,7 +68,8 @@ fn write(b: &mut Bencher<'_>, bytes_len: usize) {
         || random_bytes(bytes_len),
         // actual benchmark
         |bytes| {
-            let (_data_map, _encrypted_chunks) = encrypt(bytes).unwrap();
+            let (_data_map, _encrypted_chunks) =
+                encrypt(bytes, MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap();
         },
         BatchSize::SmallInput,
     );
@@ -72,7 +78,7 @@ fn write(b: &mut Bencher<'_>, bytes_len: usize) {
 fn read(b: &mut Bencher, bytes_len: usize) {
     b.iter_batched(
         // the setup
-        || encrypt(random_bytes(bytes_len)).unwrap(),
+        || encrypt(random_bytes(bytes_len), MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap(),
        // actual benchmark
        |(data_map, encrypted_chunks)| {
            let _raw_data = decrypt_full_set(&data_map, &encrypted_chunks).unwrap();
```
examples/basic_encryptor.rs (8 changes: 7 additions & 1 deletion)

```diff
@@ -89,6 +89,11 @@ fn file_name(name: XorName) -> String {
     string
 }
 
+/// The maximum size (before compression) of an individual chunk of a file, defined as 1024kiB.
+const MAX_CHUNK_SIZE: usize = 1024 * 1024;
+/// The minimum size (before compression) of an individual chunk of a file, defined as 1B.
+const MIN_CHUNK_SIZE: usize = 1;
+
 #[derive(Clone)]
 struct DiskBasedStorage {
     pub(crate) storage_path: String,
@@ -147,7 +152,8 @@ async fn main() {
         Err(error) => return println!("{}", error),
     }
 
-    let (data_map, encrypted_chunks) = encrypt(Bytes::from(data)).unwrap();
+    let (data_map, encrypted_chunks) =
+        encrypt(Bytes::from(data), MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap();
 
     let result = encrypted_chunks
         .par_iter()
```
src/chunk.rs (21 changes: 15 additions & 6 deletions)

```diff
@@ -32,15 +32,20 @@ pub struct RawChunk {
 
 /// Hash all the chunks.
 /// Creates [num cores] batches.
-pub(crate) fn batch_chunks(bytes: Bytes) -> (usize, Vec<EncryptionBatch>) {
+pub(crate) fn batch_chunks(
+    bytes: Bytes,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> (usize, Vec<EncryptionBatch>) {
     let data_size = bytes.len();
-    let num_chunks = get_num_chunks(data_size);
+    let num_chunks = get_num_chunks(data_size, min_chunk_size, max_chunk_size);
 
     let raw_chunks: Vec<_> = (0..num_chunks)
         .map(|index| (index, bytes.clone()))
         .par_bridge()
         .map(|(index, bytes)| {
-            let (start, end) = get_start_end_positions(data_size, index);
+            let (start, end) =
+                get_start_end_positions(data_size, index, min_chunk_size, max_chunk_size);
             let data = bytes.slice(start..end);
             let hash = XorName::from_content(data.as_ref());
             RawChunk { index, data, hash }
@@ -63,10 +68,14 @@ pub(crate) fn batch_chunks(bytes: Bytes) -> (usize, Vec<EncryptionBatch>) {
 }
 
 /// Calculate (start_position, end_position) for each chunk for the input file size
-pub(crate) fn batch_positions(data_size: usize) -> Vec<(usize, usize)> {
-    let num_chunks = get_num_chunks(data_size);
+pub(crate) fn batch_positions(
+    data_size: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> Vec<(usize, usize)> {
+    let num_chunks = get_num_chunks(data_size, min_chunk_size, max_chunk_size);
 
     (0..num_chunks)
-        .map(|index| get_start_end_positions(data_size, index))
+        .map(|index| get_start_end_positions(data_size, index, min_chunk_size, max_chunk_size))
         .collect()
 }
```
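To make the position arithmetic concrete, a hypothetical in-crate test (these helpers are `pub(crate)`, so it would live inside the crate): a 10-byte file with `min_chunk_size = 1` stays under `3 * max_chunk_size`, so it is split into exactly three chunks, the last absorbing the remainder.

```rust
#[cfg(test)]
mod chunking_sketch {
    use super::*;

    #[test]
    fn ten_bytes_split_into_three_chunks() {
        // 10 >= 3 * min_chunk_size and 10 < 3 * max_chunk_size, so 3 chunks:
        // two of file_size / 3 = 3 bytes, and a last of 10 - 2 * 3 = 4 bytes.
        let positions = batch_positions(10, 1, 1024 * 1024);
        assert_eq!(positions, vec![(0, 3), (3, 6), (6, 10)]);
    }
}
```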
src/data_map.rs (2 changes: 1 addition & 1 deletion)

```diff
@@ -13,7 +13,7 @@ use xor_name::XorName;
 
 /// Holds the information that is required to recover the content of the encrypted file.
 /// This is held as a vector of `ChunkInfo`, i.e. a list of the file's chunk hashes.
-/// Only files larger than 3072 bytes (3 * MIN_CHUNK_SIZE) can be self-encrypted.
+/// Only files of at least 3 * min_chunk_size bytes can be self-encrypted.
 /// Smaller files will have to be batched together.
 #[derive(Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Clone)]
 pub struct DataMap(Vec<ChunkInfo>);
```
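The minimum encryptable size now follows directly from the caller's `min_chunk_size`. A small sketch of that boundary, assuming the bounds used elsewhere in this commit:

```rust
use bytes::Bytes;
use self_encryption::encrypt;

fn main() {
    let max = 1024 * 1024;
    // min_encryptable_bytes = 3 * min_chunk_size = 3, so a 2-byte input errors out...
    assert!(encrypt(Bytes::from(vec![0u8; 2]), 1, max).is_err());
    // ...while 3 bytes is the smallest input that self-encrypts (three 1-byte chunks).
    assert!(encrypt(Bytes::from(vec![0u8; 3]), 1, max).is_ok());
}
```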
src/lib.rs (158 changes: 102 additions & 56 deletions)

```diff
@@ -123,12 +123,6 @@ use xor_name::XorName;
 pub use bytes;
 pub use xor_name;
 
-/// The minimum size (before compression) of data to be self-encrypted, defined as 3B.
-pub const MIN_ENCRYPTABLE_BYTES: usize = 3 * MIN_CHUNK_SIZE;
-/// The maximum size (before compression) of an individual chunk of a file, defined as 500kiB.
-pub const MAX_CHUNK_SIZE: usize = 512 * 1024;
-/// The minimum size (before compression) of an individual chunk of a file, defined as 1B.
-pub const MIN_CHUNK_SIZE: usize = 1;
 /// Controls the compression-speed vs compression-density tradeoffs. The higher the quality, the
 /// slower the compression. Range is 0 to 11.
 pub const COMPRESSION_QUALITY: i32 = 6;
@@ -163,12 +157,17 @@ pub struct StreamSelfEncryptor {
 impl StreamSelfEncryptor {
     /// For encryption, return with an initialized streaming encryptor.
     /// If a `chunk_dir` is provided, the encrypted_chunks will be written into the specified dir as well.
-    pub fn encrypt_from_file(file_path: PathBuf, chunk_dir: Option<PathBuf>) -> Result<Self> {
+    pub fn encrypt_from_file(
+        file_path: PathBuf,
+        chunk_dir: Option<PathBuf>,
+        min_chunk_size: usize,
+        max_chunk_size: usize,
+    ) -> Result<Self> {
         let file = File::open(&*file_path)?;
         let metadata = file.metadata()?;
         let file_size = metadata.len();
 
-        let batch_positions = batch_positions(file_size as usize);
+        let batch_positions = batch_positions(file_size as usize, min_chunk_size, max_chunk_size);
 
         Ok(StreamSelfEncryptor {
             file_path,
@@ -350,13 +349,18 @@ impl StreamSelfDecryptor {
 }
 
 /// Read a file from the disk to encrypt, and output the chunks to a given output directory if present.
-pub fn encrypt_from_file(file_path: &Path, output_dir: &Path) -> Result<(DataMap, Vec<XorName>)> {
+pub fn encrypt_from_file(
+    file_path: &Path,
+    output_dir: &Path,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> Result<(DataMap, Vec<XorName>)> {
     let mut file = File::open(file_path)?;
     let mut bytes = Vec::new();
     let _ = file.read_to_end(&mut bytes)?;
     let bytes = Bytes::from(bytes);
 
-    let (data_map, encrypted_chunks) = encrypt(bytes)?;
+    let (data_map, encrypted_chunks) = encrypt(bytes, min_chunk_size, max_chunk_size)?;
 
     let mut chunk_names = Vec::new();
     for chunk in encrypted_chunks {
/// Encrypts a set of bytes and returns the encrypted data together with
/// the data map that is derived from the input data, and is used to later decrypt the encrypted data.
/// Returns an error if the size is too small for self-encryption.
/// Only files larger than 3072 bytes (3 * MIN_CHUNK_SIZE) can be self-encrypted.
/// Only files larger than 3072 bytes (3 * min_chunk_size) can be self-encrypted.
/// Smaller files will have to be batched together for self-encryption to work.
pub fn encrypt(bytes: Bytes) -> Result<(DataMap, Vec<EncryptedChunk>)> {
if (MIN_ENCRYPTABLE_BYTES) > bytes.len() {
pub fn encrypt(
bytes: Bytes,
min_chunk_size: usize,
max_chunk_size: usize,
) -> Result<(DataMap, Vec<EncryptedChunk>)> {
let min_encryptable_bytes = 3 * min_chunk_size;
if (min_encryptable_bytes) > bytes.len() {
return Err(Error::Generic(format!(
"Too small for self-encryption! Required size at least {}",
MIN_ENCRYPTABLE_BYTES
min_encryptable_bytes
)));
}
let (num_chunks, batches) = chunk::batch_chunks(bytes);
let (num_chunks, batches) = chunk::batch_chunks(bytes, min_chunk_size, max_chunk_size);
let (data_map, encrypted_chunks) = encrypt::encrypt(batches);
if num_chunks > encrypted_chunks.len() {
return Err(Error::Encryption);
@@ -480,13 +489,20 @@ pub struct SeekInfo {
 /// It is used to first fetch chunks using the `index_range`.
 /// Then the chunks are passed into `self_encryption::decrypt_range` together
 /// with `relative_pos` from the `SeekInfo` instance, and the `len` to be read.
-pub fn seek_info(file_size: usize, pos: usize, len: usize) -> SeekInfo {
-    let (start_index, end_index) = overlapped_chunks(file_size, pos, len);
-
-    let relative_pos = if start_index == 2 && file_size < 3 * MAX_CHUNK_SIZE {
-        pos - (2 * get_chunk_size(file_size, 0))
+pub fn seek_info(
+    file_size: usize,
+    pos: usize,
+    len: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> SeekInfo {
+    let (start_index, end_index) =
+        overlapped_chunks(file_size, pos, len, min_chunk_size, max_chunk_size);
+
+    let relative_pos = if start_index == 2 && file_size < 3 * max_chunk_size {
+        pos - (2 * get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size))
     } else {
-        pos % get_chunk_size(file_size, start_index)
+        pos % get_chunk_size(file_size, start_index, min_chunk_size, max_chunk_size)
     };
 
     SeekInfo {
@@ -501,9 +517,15 @@ pub fn seek_info(file_size: usize, pos: usize, len: usize) -> SeekInfo {
 
 /// Returns the chunk index range [start, end) that is overlapped by the byte range defined by `pos`
 /// and `len`. Returns empty range if `file_size` is so small that there are no chunks.
-fn overlapped_chunks(file_size: usize, pos: usize, len: usize) -> (usize, usize) {
+fn overlapped_chunks(
+    file_size: usize,
+    pos: usize,
+    len: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> (usize, usize) {
     // FIX THIS SHOULD NOT BE ALLOWED
-    if file_size < (3 * MIN_CHUNK_SIZE) || pos >= file_size || len == 0 {
+    if file_size < (3 * min_chunk_size) || pos >= file_size || len == 0 {
         return (0, 0);
     }
 
@@ -513,8 +535,8 @@ fn overlapped_chunks(file_size: usize, pos: usize, len: usize) -> (usize, usize)
         None => file_size,
     };
 
-    let start_index = get_chunk_index(file_size, pos);
-    let end_index = get_chunk_index(file_size, end);
+    let start_index = get_chunk_index(file_size, pos, min_chunk_size, max_chunk_size);
+    let end_index = get_chunk_index(file_size, end, min_chunk_size, max_chunk_size);
 
     (start_index, end_index)
 }
```
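A sketch of driving the reworked seek path from the caller's side, assuming `SeekInfo`'s `index_range` and `relative_pos` fields are publicly readable as the doc comment above implies; the fetched chunks, `relative_pos`, and `len` then go to `decrypt_range`:

```rust
use self_encryption::seek_info;

const MIN_CHUNK_SIZE: usize = 1;
const MAX_CHUNK_SIZE: usize = 1024 * 1024;

fn main() {
    let file_size = 10 * 1024 * 1024;
    // Read 4096 bytes starting at byte 5_000_000.
    let seek = seek_info(file_size, 5_000_000, 4096, MIN_CHUNK_SIZE, MAX_CHUNK_SIZE);
    // `index_range` says which chunks to fetch; `relative_pos` is the offset
    // within the first fetched chunk, handed to `decrypt_range` with `len`.
    println!("chunks: {:?}, offset: {}", seek.index_range, seek.relative_pos);
}
```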
```diff
@@ -561,90 +583,114 @@ fn get_pki(src_hash: &XorName, n_1_src_hash: &XorName, n_2_src_hash: &XorName) -
 }
 
 // Returns the number of chunks according to file size.
-fn get_num_chunks(file_size: usize) -> usize {
-    if file_size < (3 * MIN_CHUNK_SIZE) {
+fn get_num_chunks(file_size: usize, min_chunk_size: usize, max_chunk_size: usize) -> usize {
+    if file_size < (3 * min_chunk_size) {
         return 0;
     }
-    if file_size < (3 * MAX_CHUNK_SIZE) {
+    if file_size < (3 * max_chunk_size) {
         return 3;
     }
-    if file_size % MAX_CHUNK_SIZE == 0 {
-        file_size / MAX_CHUNK_SIZE
+    if file_size % max_chunk_size == 0 {
+        file_size / max_chunk_size
     } else {
-        (file_size / MAX_CHUNK_SIZE) + 1
+        (file_size / max_chunk_size) + 1
     }
 }
 
-// Returns the size of a chunk according to file size.
-fn get_chunk_size(file_size: usize, chunk_index: usize) -> usize {
-    if file_size < 3 * MIN_CHUNK_SIZE {
+// Returns the size of a chunk according to file size and defined chunk sizes.
+fn get_chunk_size(
+    file_size: usize,
+    chunk_index: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> usize {
+    if file_size < 3 * min_chunk_size {
         return 0;
     }
-    if file_size < 3 * MAX_CHUNK_SIZE {
+    if file_size < 3 * max_chunk_size {
         if chunk_index < 2 {
             return file_size / 3;
         } else {
             // When the file_size % 3 > 0, the third (last) chunk includes the remainder
             return file_size - (2 * (file_size / 3));
         }
     }
-    let total_chunks = get_num_chunks(file_size);
+    let total_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
     if chunk_index < total_chunks - 2 {
-        return MAX_CHUNK_SIZE;
+        return max_chunk_size;
     }
-    let remainder = file_size % MAX_CHUNK_SIZE;
+    let remainder = file_size % max_chunk_size;
     let penultimate = (total_chunks - 2) == chunk_index;
     if remainder == 0 {
-        return MAX_CHUNK_SIZE;
+        return max_chunk_size;
     }
-    if remainder < MIN_CHUNK_SIZE {
+    if remainder < min_chunk_size {
         if penultimate {
-            MAX_CHUNK_SIZE - MIN_CHUNK_SIZE
+            max_chunk_size - min_chunk_size
         } else {
-            MIN_CHUNK_SIZE + remainder
+            min_chunk_size + remainder
         }
     } else if penultimate {
-        MAX_CHUNK_SIZE
+        max_chunk_size
     } else {
         remainder
     }
 }
 
 // Returns the [start, end) half-open byte range of a chunk.
-fn get_start_end_positions(file_size: usize, chunk_index: usize) -> (usize, usize) {
-    if get_num_chunks(file_size) == 0 {
+fn get_start_end_positions(
+    file_size: usize,
+    chunk_index: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> (usize, usize) {
+    if get_num_chunks(file_size, min_chunk_size, max_chunk_size) == 0 {
         return (0, 0);
     }
-    let start = get_start_position(file_size, chunk_index);
-    (start, start + get_chunk_size(file_size, chunk_index))
+    let start = get_start_position(file_size, chunk_index, min_chunk_size, max_chunk_size);
+    (
+        start,
+        start + get_chunk_size(file_size, chunk_index, min_chunk_size, max_chunk_size),
+    )
 }
 
-fn get_start_position(file_size: usize, chunk_index: usize) -> usize {
-    let total_chunks = get_num_chunks(file_size);
+fn get_start_position(
+    file_size: usize,
+    chunk_index: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> usize {
+    let total_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
     if total_chunks == 0 {
         return 0;
     }
     let last = (total_chunks - 1) == chunk_index;
-    let first_chunk_size = get_chunk_size(file_size, 0);
+    let first_chunk_size = get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size);
     if last {
-        first_chunk_size * (chunk_index - 1) + get_chunk_size(file_size, chunk_index - 1)
+        first_chunk_size * (chunk_index - 1)
+            + get_chunk_size(file_size, chunk_index - 1, min_chunk_size, max_chunk_size)
     } else {
         first_chunk_size * chunk_index
     }
 }
 
-fn get_chunk_index(file_size: usize, position: usize) -> usize {
-    let num_chunks = get_num_chunks(file_size);
+fn get_chunk_index(
+    file_size: usize,
+    position: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> usize {
+    let num_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
     if num_chunks == 0 {
         return 0; // FIX THIS SHOULD NOT BE ALLOWED
     }
 
-    let chunk_size = get_chunk_size(file_size, 0);
+    let chunk_size = get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size);
     let remainder = file_size % chunk_size;
 
     if remainder == 0
-        || remainder >= MIN_CHUNK_SIZE
-        || position < file_size - remainder - MIN_CHUNK_SIZE
+        || remainder >= min_chunk_size
+        || position < file_size - remainder - min_chunk_size
     {
         usize::min(position / chunk_size, num_chunks - 1)
     } else {
```
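The `remainder < min_chunk_size` branch in `get_chunk_size` above is the subtle part of the new sizing rules: the penultimate chunk donates `min_chunk_size` bytes so the final chunk never drops below the minimum. A hypothetical in-crate test with assumed bounds (`min_chunk_size = 1024`, `max_chunk_size` = 1 MiB) and a file of 3 MiB + 100 bytes:

```rust
#[test]
fn tiny_remainder_rebalances_last_two_chunks() {
    let (min, max) = (1024, 1024 * 1024);
    let file_size = 3 * max + 100; // remainder = 100, which is < min

    // file_size >= 3 * max and file_size % max != 0, so (file_size / max) + 1 = 4 chunks.
    assert_eq!(get_num_chunks(file_size, min, max), 4);

    assert_eq!(get_chunk_size(file_size, 0, min, max), max);
    assert_eq!(get_chunk_size(file_size, 1, min, max), max);
    // The penultimate chunk gives up min_chunk_size bytes...
    assert_eq!(get_chunk_size(file_size, 2, min, max), max - min);
    // ...so the last chunk holds min_chunk_size + remainder, staying above the minimum.
    assert_eq!(get_chunk_size(file_size, 3, min, max), min + 100);
}
```

In both branches the chunk sizes still sum to `file_size`; only the split between the last two chunks moves.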