feat: make chunk size user defined #388

Closed · wants to merge 1 commit
benches/lib.rs: 10 changes (8 additions & 2 deletions)
@@ -51,6 +51,11 @@ use std::time::Duration;
// https://bheisler.github.io/criterion.rs/book/analysis.html#measurement
const SAMPLE_SIZE: usize = 20;

+/// The maximum size (before compression) of an individual chunk of a file, defined as 1 MiB (1024 * 1024 bytes).
+const MAX_CHUNK_SIZE: usize = 1024 * 1024;
+/// The minimum size (before compression) of an individual chunk of a file, defined as 1 byte.
+const MIN_CHUNK_SIZE: usize = 1;

fn custom_criterion() -> Criterion {
Criterion::default()
.measurement_time(Duration::from_secs(40))
@@ -63,7 +68,8 @@ fn write(b: &mut Bencher<'_>, bytes_len: usize) {
|| random_bytes(bytes_len),
// actual benchmark
|bytes| {
-let (_data_map, _encrypted_chunks) = encrypt(bytes).unwrap();
+let (_data_map, _encrypted_chunks) =
+    encrypt(bytes, MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap();
},
BatchSize::SmallInput,
);
@@ -72,7 +78,7 @@ fn read(b: &mut Bencher, bytes_len: usize) {
fn read(b: &mut Bencher, bytes_len: usize) {
b.iter_batched(
// the setup
-|| encrypt(random_bytes(bytes_len)).unwrap(),
+|| encrypt(random_bytes(bytes_len), MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap(),
// actual benchmark
|(data_map, encrypted_chunks)| {
let _raw_data = decrypt_full_set(&data_map, &encrypted_chunks).unwrap();
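With this change, callers supply the chunk bounds at every call site instead of relying on crate-level constants. A minimal sketch of the new call shape, assuming the crate is consumed as self_encryption with this PR applied (the input size here is arbitrary):

use bytes::Bytes;
use self_encryption::encrypt;

fn encrypt_with_custom_bounds() {
    let min_chunk_size = 1; // 1 byte, matching the bench constant above
    let max_chunk_size = 1024 * 1024; // 1 MiB, matching the bench constant above
    let data = Bytes::from(vec![7u8; 4 * 1024 * 1024]); // 4 MiB of sample bytes
    let (data_map, encrypted_chunks) =
        encrypt(data, min_chunk_size, max_chunk_size).unwrap();
    // A 4 MiB input with a 1 MiB maximum divides evenly into 4 chunks.
    assert_eq!(encrypted_chunks.len(), 4);
    let _ = data_map; // needed later for decryption, so keep it safe
}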
examples/basic_encryptor.rs: 8 changes (7 additions & 1 deletion)
@@ -89,6 +89,11 @@ fn file_name(name: XorName) -> String {
string
}

+/// The maximum size (before compression) of an individual chunk of a file, defined as 1 MiB (1024 * 1024 bytes).
+const MAX_CHUNK_SIZE: usize = 1024 * 1024;
+/// The minimum size (before compression) of an individual chunk of a file, defined as 1 byte.
+const MIN_CHUNK_SIZE: usize = 1;

#[derive(Clone)]
struct DiskBasedStorage {
pub(crate) storage_path: String,
@@ -147,7 +152,8 @@ async fn main() {
Err(error) => return println!("{}", error),
}

-let (data_map, encrypted_chunks) = encrypt(Bytes::from(data)).unwrap();
+let (data_map, encrypted_chunks) =
+    encrypt(Bytes::from(data), MIN_CHUNK_SIZE, MAX_CHUNK_SIZE).unwrap();

let result = encrypted_chunks
.par_iter()
src/chunk.rs: 21 changes (15 additions & 6 deletions)
@@ -32,15 +32,20 @@ pub struct RawChunk {

/// Hash all the chunks.
/// Creates [num cores] batches.
-pub(crate) fn batch_chunks(bytes: Bytes) -> (usize, Vec<EncryptionBatch>) {
+pub(crate) fn batch_chunks(
+    bytes: Bytes,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> (usize, Vec<EncryptionBatch>) {
let data_size = bytes.len();
-let num_chunks = get_num_chunks(data_size);
+let num_chunks = get_num_chunks(data_size, min_chunk_size, max_chunk_size);

let raw_chunks: Vec<_> = (0..num_chunks)
.map(|index| (index, bytes.clone()))
.par_bridge()
.map(|(index, bytes)| {
-let (start, end) = get_start_end_positions(data_size, index);
+let (start, end) =
+    get_start_end_positions(data_size, index, min_chunk_size, max_chunk_size);
let data = bytes.slice(start..end);
let hash = XorName::from_content(data.as_ref());
RawChunk { index, data, hash }
@@ -63,10 +68,14 @@ pub(crate) fn batch_chunks(bytes: Bytes) -> (usize, Vec<EncryptionBatch>) {
}

/// Calculate (start_position, end_position) for each chunk for the input file size
-pub(crate) fn batch_positions(data_size: usize) -> Vec<(usize, usize)> {
-let num_chunks = get_num_chunks(data_size);
+pub(crate) fn batch_positions(
+    data_size: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> Vec<(usize, usize)> {
+let num_chunks = get_num_chunks(data_size, min_chunk_size, max_chunk_size);

(0..num_chunks)
-.map(|index| get_start_end_positions(data_size, index))
+.map(|index| get_start_end_positions(data_size, index, min_chunk_size, max_chunk_size))
.collect()
}
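To make the tiling concrete: batch_positions is pub(crate), so a worked example would have to live inside the crate as a unit test. A hypothetical sketch with deliberately tiny numbers:

#[test]
fn batch_positions_tile_the_input() {
    // A 10-byte input with min_chunk_size = 1 and max_chunk_size = 4 sits below
    // 3 * max_chunk_size, so it is forced into exactly three chunks of 3, 3 and 4 bytes.
    let positions = batch_positions(10, 1, 4);
    // The half-open [start, end) ranges cover the input with no gaps or overlaps.
    assert_eq!(positions, vec![(0, 3), (3, 6), (6, 10)]);
}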
src/data_map.rs: 2 changes (1 addition & 1 deletion)
@@ -13,7 +13,7 @@ use xor_name::XorName;

/// Holds the information that is required to recover the content of the encrypted file.
/// This is held as a vector of `ChunkInfo`, i.e. a list of the file's chunk hashes.
-/// Only files larger than 3072 bytes (3 * MIN_CHUNK_SIZE) can be self-encrypted.
+/// Only files of at least 3 * the minimum chunk size can be self-encrypted.
/// Smaller files will have to be batched together.
#[derive(Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct DataMap(Vec<ChunkInfo>);
src/lib.rs: 163 changes (105 additions & 58 deletions)
@@ -27,8 +27,9 @@
//! async fn main() {
//! let file_size = 10_000_000;
//! let bytes = random_bytes(file_size);
-//!
-//! if let Ok((_data_map, _encrypted_chunks)) = encrypt(bytes) {
+//! const MAX_CHUNK_SIZE: usize = 1024 * 1024;
+//! const MIN_CHUNK_SIZE: usize = 1;
+//! if let Ok((_data_map, _encrypted_chunks)) = encrypt(bytes, MIN_CHUNK_SIZE, MAX_CHUNK_SIZE) {
//! // .. then persist the `encrypted_chunks`.
//! // Remember to keep `data_map` somewhere safe..!
//! }
@@ -123,12 +124,6 @@ use xor_name::XorName;
pub use bytes;
pub use xor_name;

-/// The minimum size (before compression) of data to be self-encrypted, defined as 3B.
-pub const MIN_ENCRYPTABLE_BYTES: usize = 3 * MIN_CHUNK_SIZE;
-/// The maximum size (before compression) of an individual chunk of a file, defined as 500kiB.
-pub const MAX_CHUNK_SIZE: usize = 512 * 1024;
-/// The minimum size (before compression) of an individual chunk of a file, defined as 1B.
-pub const MIN_CHUNK_SIZE: usize = 1;
/// Controls the compression-speed vs compression-density tradeoffs. The higher the quality, the
/// slower the compression. Range is 0 to 11.
pub const COMPRESSION_QUALITY: i32 = 6;
@@ -163,12 +158,17 @@ pub struct StreamSelfEncryptor {
impl StreamSelfEncryptor {
/// For encryption, returns an initialized streaming encryptor.
/// If a `chunk_dir` is provided, the encrypted_chunks will be written into the specified dir as well.
-pub fn encrypt_from_file(file_path: PathBuf, chunk_dir: Option<PathBuf>) -> Result<Self> {
+pub fn encrypt_from_file(
+    file_path: PathBuf,
+    chunk_dir: Option<PathBuf>,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> Result<Self> {
let file = File::open(&*file_path)?;
let metadata = file.metadata()?;
let file_size = metadata.len();

-let batch_positions = batch_positions(file_size as usize);
+let batch_positions = batch_positions(file_size as usize, min_chunk_size, max_chunk_size);

Ok(StreamSelfEncryptor {
file_path,
@@ -350,13 +350,18 @@ impl StreamSelfDecryptor {
}

/// Read a file from the disk to encrypt, and output the chunks to a given output directory if one is present.
-pub fn encrypt_from_file(file_path: &Path, output_dir: &Path) -> Result<(DataMap, Vec<XorName>)> {
+pub fn encrypt_from_file(
+    file_path: &Path,
+    output_dir: &Path,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> Result<(DataMap, Vec<XorName>)> {
let mut file = File::open(file_path)?;
let mut bytes = Vec::new();
let _ = file.read_to_end(&mut bytes)?;
let bytes = Bytes::from(bytes);

-let (data_map, encrypted_chunks) = encrypt(bytes)?;
+let (data_map, encrypted_chunks) = encrypt(bytes, min_chunk_size, max_chunk_size)?;

let mut chunk_names = Vec::new();
for chunk in encrypted_chunks {
Expand Down Expand Up @@ -401,16 +406,21 @@ pub fn decrypt_from_chunk_files(
/// Encrypts a set of bytes and returns the encrypted data together with
/// the data map that is derived from the input data, and is used to later decrypt the encrypted data.
/// Returns an error if the size is too small for self-encryption.
-/// Only files larger than 3072 bytes (3 * MIN_CHUNK_SIZE) can be self-encrypted.
+/// Only files of at least 3 * min_chunk_size bytes can be self-encrypted.
/// Smaller files will have to be batched together for self-encryption to work.
-pub fn encrypt(bytes: Bytes) -> Result<(DataMap, Vec<EncryptedChunk>)> {
-if (MIN_ENCRYPTABLE_BYTES) > bytes.len() {
+pub fn encrypt(
+    bytes: Bytes,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> Result<(DataMap, Vec<EncryptedChunk>)> {
+let min_encryptable_bytes = 3 * min_chunk_size;
+if min_encryptable_bytes > bytes.len() {
return Err(Error::Generic(format!(
"Too small for self-encryption! Required size at least {}",
-MIN_ENCRYPTABLE_BYTES
+min_encryptable_bytes
)));
}
-let (num_chunks, batches) = chunk::batch_chunks(bytes);
+let (num_chunks, batches) = chunk::batch_chunks(bytes, min_chunk_size, max_chunk_size);
let (data_map, encrypted_chunks) = encrypt::encrypt(batches);
if num_chunks > encrypted_chunks.len() {
return Err(Error::Encryption);
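Since the old MIN_ENCRYPTABLE_BYTES constant is replaced by 3 * min_chunk_size, the smallest encryptable input now depends on the caller's choice of bounds. A sketch of the boundary behaviour, assuming this PR's signature:

use bytes::Bytes;
use self_encryption::encrypt;

fn minimum_size_boundary() {
    let (min_chunk_size, max_chunk_size) = (1024, 512 * 1024);
    // One byte short of 3 * min_chunk_size: rejected with the "too small" error.
    let too_small = Bytes::from(vec![0u8; 3 * 1024 - 1]);
    assert!(encrypt(too_small, min_chunk_size, max_chunk_size).is_err());
    // Exactly 3 * min_chunk_size bytes: accepted and split into three chunks.
    let just_enough = Bytes::from(vec![0u8; 3 * 1024]);
    assert!(encrypt(just_enough, min_chunk_size, max_chunk_size).is_ok());
}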
@@ -480,13 +490,20 @@ pub struct SeekInfo {
/// It is used to first fetch chunks using the `index_range`.
/// Then the chunks are passed into `self_encryption::decrypt_range` together
/// with `relative_pos` from the `SeekInfo` instance, and the `len` to be read.
-pub fn seek_info(file_size: usize, pos: usize, len: usize) -> SeekInfo {
-let (start_index, end_index) = overlapped_chunks(file_size, pos, len);
-
-let relative_pos = if start_index == 2 && file_size < 3 * MAX_CHUNK_SIZE {
-pos - (2 * get_chunk_size(file_size, 0))
+pub fn seek_info(
+    file_size: usize,
+    pos: usize,
+    len: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> SeekInfo {
+let (start_index, end_index) =
+    overlapped_chunks(file_size, pos, len, min_chunk_size, max_chunk_size);
+
+let relative_pos = if start_index == 2 && file_size < 3 * max_chunk_size {
+pos - (2 * get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size))
} else {
-pos % get_chunk_size(file_size, start_index)
+pos % get_chunk_size(file_size, start_index, min_chunk_size, max_chunk_size)
};

SeekInfo {
@@ -501,9 +518,15 @@ pub fn seek_info(file_size: usize, pos: usize, len: usize) -> SeekInfo {

/// Returns the chunk index range [start, end) that is overlapped by the byte range defined by `pos`
/// and `len`. Returns empty range if `file_size` is so small that there are no chunks.
-fn overlapped_chunks(file_size: usize, pos: usize, len: usize) -> (usize, usize) {
+fn overlapped_chunks(
+    file_size: usize,
+    pos: usize,
+    len: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> (usize, usize) {
// FIX THIS SHOULD NOT BE ALLOWED
-if file_size < (3 * MIN_CHUNK_SIZE) || pos >= file_size || len == 0 {
+if file_size < (3 * min_chunk_size) || pos >= file_size || len == 0 {
return (0, 0);
}

@@ -513,8 +536,8 @@ fn overlapped_chunks(file_size: usize, pos: usize, len: usize) -> (usize, usize)
None => file_size,
};

-let start_index = get_chunk_index(file_size, pos);
-let end_index = get_chunk_index(file_size, end);
+let start_index = get_chunk_index(file_size, pos, min_chunk_size, max_chunk_size);
+let end_index = get_chunk_index(file_size, end, min_chunk_size, max_chunk_size);

(start_index, end_index)
}
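Range reads thread the same two bounds through seek_info. A usage sketch, assuming the public seek_info and the SeekInfo fields described in the doc comment above (exact field types are not shown in this diff):

use self_encryption::seek_info;

fn plan_a_range_read() {
    let (min_chunk_size, max_chunk_size) = (1, 1024 * 1024);
    let file_size = 10 * max_chunk_size; // ten 1 MiB chunks
    // Plan a 4096-byte read starting halfway into the file.
    let info = seek_info(file_size, file_size / 2, 4096, min_chunk_size, max_chunk_size);
    // info.index_range says which chunks to fetch; info.relative_pos is the
    // read's offset within the first fetched chunk (here chunk 5, offset 0).
    let _ = info;
}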
@@ -561,90 +584,114 @@ fn get_pki(src_hash: &XorName, n_1_src_hash: &XorName, n_2_src_hash: &XorName) -
}

// Returns the number of chunks according to file size.
-fn get_num_chunks(file_size: usize) -> usize {
-if file_size < (3 * MIN_CHUNK_SIZE) {
+fn get_num_chunks(file_size: usize, min_chunk_size: usize, max_chunk_size: usize) -> usize {
+if file_size < (3 * min_chunk_size) {
return 0;
}
-if file_size < (3 * MAX_CHUNK_SIZE) {
+if file_size < (3 * max_chunk_size) {
return 3;
}
-if file_size % MAX_CHUNK_SIZE == 0 {
-file_size / MAX_CHUNK_SIZE
+if file_size % max_chunk_size == 0 {
+file_size / max_chunk_size
} else {
-(file_size / MAX_CHUNK_SIZE) + 1
+(file_size / max_chunk_size) + 1
}
}
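// For intuition, a hypothetical unit test exercising the three regimes above,
// with min_chunk_size = 1 and max_chunk_size = 1 MiB:
#[test]
fn num_chunks_regimes_sketch() {
    const MIB: usize = 1024 * 1024;
    // Below 3 * min_chunk_size no chunking is possible.
    assert_eq!(get_num_chunks(2, 1, MIB), 0);
    // Everything below 3 * max_chunk_size is forced into exactly 3 chunks.
    assert_eq!(get_num_chunks(3, 1, MIB), 3);
    assert_eq!(get_num_chunks(2 * MIB, 1, MIB), 3);
    // From 3 * max_chunk_size upward, chunks are max-sized, rounding up.
    assert_eq!(get_num_chunks(3 * MIB, 1, MIB), 3);
    assert_eq!(get_num_chunks(3 * MIB + 1, 1, MIB), 4);
    assert_eq!(get_num_chunks(10 * MIB, 1, MIB), 10);
}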

-// Returns the size of a chunk according to file size.
-fn get_chunk_size(file_size: usize, chunk_index: usize) -> usize {
-if file_size < 3 * MIN_CHUNK_SIZE {
+// Returns the size of a chunk according to file size and defined chunk sizes.
+fn get_chunk_size(
+    file_size: usize,
+    chunk_index: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> usize {
+if file_size < 3 * min_chunk_size {
return 0;
}
-if file_size < 3 * MAX_CHUNK_SIZE {
+if file_size < 3 * max_chunk_size {
if chunk_index < 2 {
return file_size / 3;
} else {
// When the file_size % 3 > 0, the third (last) chunk includes the remainder
return file_size - (2 * (file_size / 3));
}
}
-let total_chunks = get_num_chunks(file_size);
+let total_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
if chunk_index < total_chunks - 2 {
-return MAX_CHUNK_SIZE;
+return max_chunk_size;
}
-let remainder = file_size % MAX_CHUNK_SIZE;
+let remainder = file_size % max_chunk_size;
let penultimate = (total_chunks - 2) == chunk_index;
if remainder == 0 {
-return MAX_CHUNK_SIZE;
+return max_chunk_size;
}
-if remainder < MIN_CHUNK_SIZE {
+if remainder < min_chunk_size {
if penultimate {
-MAX_CHUNK_SIZE - MIN_CHUNK_SIZE
+max_chunk_size - min_chunk_size
} else {
-MIN_CHUNK_SIZE + remainder
+min_chunk_size + remainder
}
} else if penultimate {
-MAX_CHUNK_SIZE
+max_chunk_size
} else {
remainder
}
}
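// A hypothetical unit test for the remainder rebalancing above: when the tail
// is smaller than min_chunk_size, the penultimate chunk cedes min_chunk_size
// bytes so the final chunk still reaches the minimum.
#[test]
fn tiny_remainder_rebalancing_sketch() {
    const MIB: usize = 1024 * 1024;
    let (min, max) = (1024, MIB);
    let file_size = 3 * MIB + 100; // 100-byte tail, below min_chunk_size
    assert_eq!(get_chunk_size(file_size, 0, min, max), MIB);
    assert_eq!(get_chunk_size(file_size, 1, min, max), MIB);
    assert_eq!(get_chunk_size(file_size, 2, min, max), MIB - 1024); // penultimate
    assert_eq!(get_chunk_size(file_size, 3, min, max), 1024 + 100); // last
}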

// Returns the [start, end) half-open byte range of a chunk.
-fn get_start_end_positions(file_size: usize, chunk_index: usize) -> (usize, usize) {
-if get_num_chunks(file_size) == 0 {
+fn get_start_end_positions(
+    file_size: usize,
+    chunk_index: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> (usize, usize) {
+if get_num_chunks(file_size, min_chunk_size, max_chunk_size) == 0 {
return (0, 0);
}
-let start = get_start_position(file_size, chunk_index);
-(start, start + get_chunk_size(file_size, chunk_index))
+let start = get_start_position(file_size, chunk_index, min_chunk_size, max_chunk_size);
+(
+    start,
+    start + get_chunk_size(file_size, chunk_index, min_chunk_size, max_chunk_size),
+)
}

-fn get_start_position(file_size: usize, chunk_index: usize) -> usize {
-let total_chunks = get_num_chunks(file_size);
+fn get_start_position(
+    file_size: usize,
+    chunk_index: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> usize {
+let total_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
if total_chunks == 0 {
return 0;
}
let last = (total_chunks - 1) == chunk_index;
-let first_chunk_size = get_chunk_size(file_size, 0);
+let first_chunk_size = get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size);
if last {
-first_chunk_size * (chunk_index - 1) + get_chunk_size(file_size, chunk_index - 1)
+first_chunk_size * (chunk_index - 1)
+    + get_chunk_size(file_size, chunk_index - 1, min_chunk_size, max_chunk_size)
} else {
first_chunk_size * chunk_index
}
}
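// A hypothetical unit test tying the two functions above together, for a
// 10-byte input with min_chunk_size = 1 and max_chunk_size = 4 (chunk sizes
// 3, 3 and 4):
#[test]
fn start_end_positions_sketch() {
    assert_eq!(get_start_end_positions(10, 0, 1, 4), (0, 3));
    assert_eq!(get_start_end_positions(10, 1, 1, 4), (3, 6));
    // The last chunk starts where the penultimate ends and absorbs the rest.
    assert_eq!(get_start_end_positions(10, 2, 1, 4), (6, 10));
}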

-fn get_chunk_index(file_size: usize, position: usize) -> usize {
-let num_chunks = get_num_chunks(file_size);
+fn get_chunk_index(
+    file_size: usize,
+    position: usize,
+    min_chunk_size: usize,
+    max_chunk_size: usize,
+) -> usize {
+let num_chunks = get_num_chunks(file_size, min_chunk_size, max_chunk_size);
if num_chunks == 0 {
return 0; // FIX THIS SHOULD NOT BE ALLOWED
}

-let chunk_size = get_chunk_size(file_size, 0);
+let chunk_size = get_chunk_size(file_size, 0, min_chunk_size, max_chunk_size);
let remainder = file_size % chunk_size;

if remainder == 0
-|| remainder >= MIN_CHUNK_SIZE
-|| position < file_size - remainder - MIN_CHUNK_SIZE
+|| remainder >= min_chunk_size
+|| position < file_size - remainder - min_chunk_size
{
usize::min(position / chunk_size, num_chunks - 1)
} else {