From e2dba083e16891f7b28632b4eda85b8b55ab9def Mon Sep 17 00:00:00 2001 From: LTLA Date: Mon, 27 Nov 2023 12:30:18 -0800 Subject: [PATCH] Extend the optimizer so that it works for sparse arrays. This includes the usual sparse Matrix classes, SVT_SparseMatrix objects, and sparse block processing of everything else. --- R/optimize_storage.R | 168 +++++++++++++++++++------ tests/testthat/test-optimize_storage.R | 52 ++++++++ 2 files changed, 183 insertions(+), 37 deletions(-) diff --git a/R/optimize_storage.R b/R/optimize_storage.R index f28fdcf..fd179a1 100644 --- a/R/optimize_storage.R +++ b/R/optimize_storage.R @@ -28,6 +28,10 @@ aggregate_max <- function(collated, name) { max(unlist(lapply(collated, function(y) y[[name]])), na.rm=TRUE) } +aggregate_sum <- function(collated, name) { + sum(vapply(collated, function(y) y[[name]], 0L)) +} + ################################################### ################################################### @@ -40,14 +44,42 @@ setGeneric("collect_integer_attributes", function(x) standardGeneric("collect_in ) } -setMethod("collect_integer_attributes", "array", .simple_integer_collector) - -setMethod("collect_integer_attributes", "ANY", function(x) { - collated <- blockApply(x, .simple_integer_collector) +.combine_integer_attributes <- function(collated) { list( range=aggregate_range(collated, "range"), missing=aggregate_any(collated, "missing") ) +} + +setMethod("collect_integer_attributes", "array", .simple_integer_collector) + +setMethod("collect_integer_attributes", "SVT_SparseMatrix", function(x) { + collated <- lapply(x@SVT, function(y) { + out <- .simple_integer_collector(y[[2]]) + out$non_zero <- length(y[[2]]) + out + }) + output <- list(non_zero=aggregate_sum(collated, "non_zero")) + c(output, .combine_integer_attributes(collated)) +}) + +#' @importFrom S4Arrays is_sparse +#' @importFrom SparseArray nzdata +setMethod("collect_integer_attributes", "ANY", function(x) { + output <- list() + + if (is_sparse(x)) { + collated <- blockApply(x, function(y) { + out <- .simple_integer_collector(nzdata(y)) + out$non_zero <- length(nzdata(y)) + out + }, as.sparse=TRUE) + output$non_zero <- aggregate_sum(collated, "non_zero") + } else { + collated <- blockApply(x, .simple_integer_collector) + } + + c(output, .combine_integer_attributes(collated)) }) optimize_integer_storage <- function(x) { @@ -57,47 +89,47 @@ optimize_integer_storage <- function(x) { lower <- attr$range[1] upper <- attr$range[2] if (is.infinite(lower)) { - return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7))) + return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7), size=attr$non_zero)) } if (lower < 0L) { if (lower > -2^7 && upper < 2^7) { - return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7))) + return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7), size=attr$non_zero)) } else if (lower > -2^15 && upper < 2^15) { - return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15))) + return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15), size=attr$non_zero)) } } else { if (upper < 2^8 - 1) { - return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1))) + return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1), size=attr$non_zero)) } else if (upper < 2^16 - 1) { - return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1))) + return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1), size=attr$non_zero)) } } - return(list(type="H5T_NATIVE_INT32", placeholder=NA_integer_)) + return(list(type="H5T_NATIVE_INT32", placeholder=NA_integer_, size=attr$non_zero)) } else { lower <- attr$range[1] upper <- attr$range[2] if (is.infinite(lower)) { - return(list(type="H5T_NATIVE_INT8", placeholder=NULL)) + return(list(type="H5T_NATIVE_INT8", placeholder=NULL, size=attr$non_zero)) } if (lower < 0L) { if (lower >= -2^7 && upper < 2^7) { - return(list(type="H5T_NATIVE_INT8", placeholder=NULL)) + return(list(type="H5T_NATIVE_INT8", placeholder=NULL, size=attr$non_zero)) } else if (lower >= -2^15 && upper < 2^15) { - return(list(type="H5T_NATIVE_INT16", placeholder=NULL)) + return(list(type="H5T_NATIVE_INT16", placeholder=NULL, size=attr$non_zero)) } } else { if (upper < 2^8) { - return(list(type="H5T_NATIVE_UINT8", placeholder=NULL)) + return(list(type="H5T_NATIVE_UINT8", placeholder=NULL, size=attr$non_zero)) } else if (upper < 2^16) { - return(list(type="H5T_NATIVE_UINT16", placeholder=NULL)) + return(list(type="H5T_NATIVE_UINT16", placeholder=NULL, size=attr$non_zero)) } } - return(list(type="H5T_NATIVE_INT32", placeholder=NULL)) + return(list(type="H5T_NATIVE_INT32", placeholder=NULL, size=attr$non_zero)) } } @@ -110,15 +142,46 @@ setMethod("collect_float_attributes", "array", collect_double_attributes) setMethod("collect_float_attributes", "ddenseMatrix", function(x) collect_double_attributes(x@x)) -setMethod("collect_float_attributes", "ANY", function(x) { - collated <- blockApply(x, collect_double_attributes) +setMethod("collect_float_attributes", "dsparseMatrix", function(x) { + out <- collect_double_attributes(x@x) + out$non_zero <- length(x@x) + out +}) +.combine_float_attributes <- function(collated) { output <- list(range=aggregate_range(collated, "range")) for (n in c("missing", "non_integer", "has_NaN", "has_Inf", "has_nInf", "has_lowest", "has_highest")) { output[[n]] <- aggregate_any(collated, n) } - output +} + +setMethod("collect_float_attributes", "SVT_SparseMatrix", function(x) { + collated <- lapply(x@SVT, function(y) { + out <- collect_double_attributes(y[[2]]) + out$non_zero <- length(y[[2]]) + out + }) + output <- list(non_zero=aggregate_sum(collated, "non_zero")) + c(output, .combine_float_attributes(collated)) +}) + +#' @importFrom S4Arrays is_sparse +#' @importFrom SparseArray nzdata +setMethod("collect_float_attributes", "ANY", function(x) { + output <- list() + if (is_sparse(x)) { + collated <- blockApply(x, function(y) { + nzd <- nzdata(y) + out <- collect_double_attributes(nzd) + out$non_zero <- length(nzd) + out + }, as.sparse=TRUE) + output$non_zero <- aggregate_sum(collated, "non_zero") + } else { + collated <- blockApply(x, collect_double_attributes) + } + c(output, .combine_float_attributes(collated)) }) optimize_float_storage <- function(x) { @@ -130,19 +193,19 @@ optimize_float_storage <- function(x) { upper <- attr$range[2] if (lower < 0L) { if (lower > -2^7 && upper < 2^7) { - return(list(type="H5T_NATIVE_INT8", placeholder=-2^7)) + return(list(type="H5T_NATIVE_INT8", placeholder=-2^7, size=attr$non_zero)) } else if (lower > -2^15 && upper < 2^15) { - return(list(type="H5T_NATIVE_INT16", placeholder=-2^15)) + return(list(type="H5T_NATIVE_INT16", placeholder=-2^15, size=attr$non_zero)) } else if (lower > -2^31 && upper < 2^31) { - return(list(type="H5T_NATIVE_INT32", placeholder=-2^31)) + return(list(type="H5T_NATIVE_INT32", placeholder=-2^31, size=attr$non_zero)) } } else { if (upper < 2^8-1) { - return(list(type="H5T_NATIVE_UINT8", placeholder=2^8-1)) + return(list(type="H5T_NATIVE_UINT8", placeholder=2^8-1, size=attr$non_zero)) } else if (upper < 2^16-1) { - return(list(type="H5T_NATIVE_UINT16", placeholder=2^16-1)) + return(list(type="H5T_NATIVE_UINT16", placeholder=2^16-1, size=attr$non_zero)) } else if (upper < 2^32-1) { - return(list(type="H5T_NATIVE_UINT32", placeholder=2^32-1)) + return(list(type="H5T_NATIVE_UINT32", placeholder=2^32-1, size=attr$non_zero)) } } } @@ -162,11 +225,15 @@ optimize_float_storage <- function(x) { # Fallback that just goes through and pulls out all unique values. if (is.null(placeholder)) { - u <- Reduce(union, blockApply(x, function(y) unique(as.vector(y)))) + if (is_sparse(x)) { + u <- Reduce(union, blockApply(x, function(y) unique(nzdata(y)))) + } else { + u <- Reduce(union, blockApply(x, function(y) unique(as.vector(y)))) + } placeholder <- chooseMissingPlaceholderForHdf5(u) } - return(list(type="H5T_NATIVE_DOUBLE", placeholder=placeholder)) + return(list(type="H5T_NATIVE_DOUBLE", placeholder=placeholder, size=attr$non_zero)) } else { if (!attr$non_integer) { @@ -174,24 +241,24 @@ optimize_float_storage <- function(x) { upper <- attr$range[2] if (lower < 0L) { if (lower >= -2^7 && upper < 2^7) { - return(list(type="H5T_NATIVE_INT8", placeholder=NULL)) + return(list(type="H5T_NATIVE_INT8", placeholder=NULL, size=attr$non_zero)) } else if (lower >= -2^15 && upper < 2^15) { - return(list(type="H5T_NATIVE_INT16", placeholder=NULL)) + return(list(type="H5T_NATIVE_INT16", placeholder=NULL, size=attr$non_zero)) } else if (lower >= -2^31 && upper < 2^31) { - return(list(type="H5T_NATIVE_INT32", placeholder=NULL)) + return(list(type="H5T_NATIVE_INT32", placeholder=NULL, size=attr$non_zero)) } } else { if (upper < 2^8) { - return(list(type="H5T_NATIVE_UINT8", placeholder=NULL)) + return(list(type="H5T_NATIVE_UINT8", placeholder=NULL, size=attr$non_zero)) } else if (upper < 2^16) { - return(list(type="H5T_NATIVE_UINT16", placeholder=NULL)) + return(list(type="H5T_NATIVE_UINT16", placeholder=NULL, size=attr$non_zero)) } else if (upper < 2^32) { - return(list(type="H5T_NATIVE_UINT32", placeholder=NULL)) + return(list(type="H5T_NATIVE_UINT32", placeholder=NULL, size=attr$non_zero)) } } } - return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL)) + return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL, size=attr$non_zero)) } } @@ -251,10 +318,37 @@ optimize_string_storage <- function(x) { ################################################### ################################################### +setGeneric("collect_boolean_attributes", function(x) standardGeneric("collect_boolean_attributes")) + +setMethod("collect_boolean_attributes", "ANY", function(x) { + output <- list() + if (is_sparse(x)) { + collated <- blockApply(x, function(x) list(missing=anyNA(nzdata(x)), non_zero=length(nzdata(x))), as.sparse=TRUE) + output$non_zero <- aggregate_sum(collated, "non_zero") + } else { + collated <- list(list(missing=anyNA(x))) + } + output$missing <- aggregate_any(collated, "missing") + output +}) + +setMethod("collect_boolean_attributes", "lsparseMatrix", function(x) { + list(missing=anyNA(x), non_zero=length(x@x)) +}) + +setMethod("collect_boolean_attributes", "SVT_SparseMatrix", function(x) { + collated <- lapply(x@SVT, function(y) list(missing=anyNA(y[[2]]), non_zero=length(y[[2]]))) + list( + missing=aggregate_any(collated, "missing"), + non_zero=aggregate_sum(collated, "non_zero") + ) +}) + optimize_boolean_storage <- function(x) { - if (anyNA(x)) { - list(type="H5T_NATIVE_INT8", placeholder=-1L) + attr <- collect_boolean_attributes(x) + if (attr$missing) { + list(type="H5T_NATIVE_INT8", placeholder=-1L, size=attr$non_zero) } else { - list(type="H5T_NATIVE_INT8", placeholder=NULL) + list(type="H5T_NATIVE_INT8", placeholder=NULL, size=attr$non_zero) } } diff --git a/tests/testthat/test-optimize_storage.R b/tests/testthat/test-optimize_storage.R index fd34074..e73b771 100644 --- a/tests/testthat/test-optimize_storage.R +++ b/tests/testthat/test-optimize_storage.R @@ -398,3 +398,55 @@ test_that("storage optimization works for booleans", { expect_equal(out$placeholder, -1L) } }) + +test_that("storage optimization works for sparse objects", { + old <- getAutoBlockSize() + setAutoBlockSize(400) + on.exit(setAutoBlockSize(old)) + + for (i in 1:3) { + if (i == 1L) { + fun <- function(x) as(x, "sparseMatrix") + } else if (i == 2L) { + fun <- function(x) as(x, "SVT_SparseMatrix") + } else { + fun <- function(x) DelayedArray(as(x, "SparseArraySeed")) + } + + # Integer. + { + y <- matrix(0L, nrow=100, ncol=10) + y[1+sample(999, 100)] <- 10000L + y[1] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(y)) + expect_identical(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 2^16-1) + expect_identical(out$size, 101L) + } + + # Boolean + { + y <- matrix(FALSE, nrow=100, ncol=10) + y[1+sample(999, 50)] <- TRUE + y[1] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(y)) + expect_identical(out$type, "H5T_NATIVE_INT8") + expect_equal(out$placeholder, -1L) + expect_identical(out$size, 51L) + } + + # Number + { + y <- matrix(0, nrow=100, ncol=10) + y[1+sample(999, 200)] <- runif(200) + y[1] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(y)) + expect_identical(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, NaN) + expect_identical(out$size, 201L) + } + } +})