Skip to content

Commit

Permalink
Extend the optimizer so that it works for sparse arrays.
Browse files Browse the repository at this point in the history
This includes the usual sparse Matrix classes, SVT_SparseMatrix objects,
and sparse block processing of everything else.
  • Loading branch information
LTLA committed Nov 27, 2023
1 parent d52ab0b commit e2dba08
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 37 deletions.
168 changes: 131 additions & 37 deletions R/optimize_storage.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ aggregate_max <- function(collated, name) {
max(unlist(lapply(collated, function(y) y[[name]])), na.rm=TRUE)
}

aggregate_sum <- function(collated, name) {
sum(vapply(collated, function(y) y[[name]], 0L))
}

###################################################
###################################################

Expand All @@ -40,14 +44,42 @@ setGeneric("collect_integer_attributes", function(x) standardGeneric("collect_in
)
}

setMethod("collect_integer_attributes", "array", .simple_integer_collector)

setMethod("collect_integer_attributes", "ANY", function(x) {
collated <- blockApply(x, .simple_integer_collector)
.combine_integer_attributes <- function(collated) {
list(
range=aggregate_range(collated, "range"),
missing=aggregate_any(collated, "missing")
)
}

setMethod("collect_integer_attributes", "array", .simple_integer_collector)

setMethod("collect_integer_attributes", "SVT_SparseMatrix", function(x) {
collated <- lapply(x@SVT, function(y) {
out <- .simple_integer_collector(y[[2]])
out$non_zero <- length(y[[2]])
out
})
output <- list(non_zero=aggregate_sum(collated, "non_zero"))
c(output, .combine_integer_attributes(collated))
})

#' @importFrom S4Arrays is_sparse
#' @importFrom SparseArray nzdata
setMethod("collect_integer_attributes", "ANY", function(x) {
output <- list()

if (is_sparse(x)) {
collated <- blockApply(x, function(y) {
out <- .simple_integer_collector(nzdata(y))
out$non_zero <- length(nzdata(y))
out
}, as.sparse=TRUE)
output$non_zero <- aggregate_sum(collated, "non_zero")
} else {
collated <- blockApply(x, .simple_integer_collector)
}

c(output, .combine_integer_attributes(collated))
})

optimize_integer_storage <- function(x) {
Expand All @@ -57,47 +89,47 @@ optimize_integer_storage <- function(x) {
lower <- attr$range[1]
upper <- attr$range[2]
if (is.infinite(lower)) {
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7)))
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7), size=attr$non_zero))
}

if (lower < 0L) {
if (lower > -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7)))
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7), size=attr$non_zero))
} else if (lower > -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15)))
return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15), size=attr$non_zero))
}
} else {
if (upper < 2^8 - 1) {
return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1)))
return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1), size=attr$non_zero))
} else if (upper < 2^16 - 1) {
return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1)))
return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1), size=attr$non_zero))
}
}

return(list(type="H5T_NATIVE_INT32", placeholder=NA_integer_))
return(list(type="H5T_NATIVE_INT32", placeholder=NA_integer_, size=attr$non_zero))

} else {
lower <- attr$range[1]
upper <- attr$range[2]
if (is.infinite(lower)) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL))
return(list(type="H5T_NATIVE_INT8", placeholder=NULL, size=attr$non_zero))
}

if (lower < 0L) {
if (lower >= -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL))
return(list(type="H5T_NATIVE_INT8", placeholder=NULL, size=attr$non_zero))
} else if (lower >= -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=NULL))
return(list(type="H5T_NATIVE_INT16", placeholder=NULL, size=attr$non_zero))
}
} else {
if (upper < 2^8) {
return(list(type="H5T_NATIVE_UINT8", placeholder=NULL))
return(list(type="H5T_NATIVE_UINT8", placeholder=NULL, size=attr$non_zero))
} else if (upper < 2^16) {
return(list(type="H5T_NATIVE_UINT16", placeholder=NULL))
return(list(type="H5T_NATIVE_UINT16", placeholder=NULL, size=attr$non_zero))
}
}

return(list(type="H5T_NATIVE_INT32", placeholder=NULL))
return(list(type="H5T_NATIVE_INT32", placeholder=NULL, size=attr$non_zero))
}
}

Expand All @@ -110,15 +142,46 @@ setMethod("collect_float_attributes", "array", collect_double_attributes)

setMethod("collect_float_attributes", "ddenseMatrix", function(x) collect_double_attributes(x@x))

setMethod("collect_float_attributes", "ANY", function(x) {
collated <- blockApply(x, collect_double_attributes)
setMethod("collect_float_attributes", "dsparseMatrix", function(x) {
out <- collect_double_attributes(x@x)
out$non_zero <- length(x@x)
out
})

.combine_float_attributes <- function(collated) {
output <- list(range=aggregate_range(collated, "range"))
for (n in c("missing", "non_integer", "has_NaN", "has_Inf", "has_nInf", "has_lowest", "has_highest")) {
output[[n]] <- aggregate_any(collated, n)
}

output
}

setMethod("collect_float_attributes", "SVT_SparseMatrix", function(x) {
collated <- lapply(x@SVT, function(y) {
out <- collect_double_attributes(y[[2]])
out$non_zero <- length(y[[2]])
out
})
output <- list(non_zero=aggregate_sum(collated, "non_zero"))
c(output, .combine_float_attributes(collated))
})

#' @importFrom S4Arrays is_sparse
#' @importFrom SparseArray nzdata
setMethod("collect_float_attributes", "ANY", function(x) {
output <- list()
if (is_sparse(x)) {
collated <- blockApply(x, function(y) {
nzd <- nzdata(y)
out <- collect_double_attributes(nzd)
out$non_zero <- length(nzd)
out
}, as.sparse=TRUE)
output$non_zero <- aggregate_sum(collated, "non_zero")
} else {
collated <- blockApply(x, collect_double_attributes)
}
c(output, .combine_float_attributes(collated))
})

optimize_float_storage <- function(x) {
Expand All @@ -130,19 +193,19 @@ optimize_float_storage <- function(x) {
upper <- attr$range[2]
if (lower < 0L) {
if (lower > -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=-2^7))
return(list(type="H5T_NATIVE_INT8", placeholder=-2^7, size=attr$non_zero))
} else if (lower > -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=-2^15))
return(list(type="H5T_NATIVE_INT16", placeholder=-2^15, size=attr$non_zero))
} else if (lower > -2^31 && upper < 2^31) {
return(list(type="H5T_NATIVE_INT32", placeholder=-2^31))
return(list(type="H5T_NATIVE_INT32", placeholder=-2^31, size=attr$non_zero))
}
} else {
if (upper < 2^8-1) {
return(list(type="H5T_NATIVE_UINT8", placeholder=2^8-1))
return(list(type="H5T_NATIVE_UINT8", placeholder=2^8-1, size=attr$non_zero))
} else if (upper < 2^16-1) {
return(list(type="H5T_NATIVE_UINT16", placeholder=2^16-1))
return(list(type="H5T_NATIVE_UINT16", placeholder=2^16-1, size=attr$non_zero))
} else if (upper < 2^32-1) {
return(list(type="H5T_NATIVE_UINT32", placeholder=2^32-1))
return(list(type="H5T_NATIVE_UINT32", placeholder=2^32-1, size=attr$non_zero))
}
}
}
Expand All @@ -162,36 +225,40 @@ optimize_float_storage <- function(x) {

# Fallback that just goes through and pulls out all unique values.
if (is.null(placeholder)) {
u <- Reduce(union, blockApply(x, function(y) unique(as.vector(y))))
if (is_sparse(x)) {
u <- Reduce(union, blockApply(x, function(y) unique(nzdata(y))))
} else {
u <- Reduce(union, blockApply(x, function(y) unique(as.vector(y))))
}
placeholder <- chooseMissingPlaceholderForHdf5(u)
}

return(list(type="H5T_NATIVE_DOUBLE", placeholder=placeholder))
return(list(type="H5T_NATIVE_DOUBLE", placeholder=placeholder, size=attr$non_zero))

} else {
if (!attr$non_integer) {
lower <- attr$range[1]
upper <- attr$range[2]
if (lower < 0L) {
if (lower >= -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL))
return(list(type="H5T_NATIVE_INT8", placeholder=NULL, size=attr$non_zero))
} else if (lower >= -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=NULL))
return(list(type="H5T_NATIVE_INT16", placeholder=NULL, size=attr$non_zero))
} else if (lower >= -2^31 && upper < 2^31) {
return(list(type="H5T_NATIVE_INT32", placeholder=NULL))
return(list(type="H5T_NATIVE_INT32", placeholder=NULL, size=attr$non_zero))
}
} else {
if (upper < 2^8) {
return(list(type="H5T_NATIVE_UINT8", placeholder=NULL))
return(list(type="H5T_NATIVE_UINT8", placeholder=NULL, size=attr$non_zero))
} else if (upper < 2^16) {
return(list(type="H5T_NATIVE_UINT16", placeholder=NULL))
return(list(type="H5T_NATIVE_UINT16", placeholder=NULL, size=attr$non_zero))
} else if (upper < 2^32) {
return(list(type="H5T_NATIVE_UINT32", placeholder=NULL))
return(list(type="H5T_NATIVE_UINT32", placeholder=NULL, size=attr$non_zero))
}
}
}

return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL))
return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL, size=attr$non_zero))
}
}

Expand Down Expand Up @@ -251,10 +318,37 @@ optimize_string_storage <- function(x) {
###################################################
###################################################

setGeneric("collect_boolean_attributes", function(x) standardGeneric("collect_boolean_attributes"))

setMethod("collect_boolean_attributes", "ANY", function(x) {
output <- list()
if (is_sparse(x)) {
collated <- blockApply(x, function(x) list(missing=anyNA(nzdata(x)), non_zero=length(nzdata(x))), as.sparse=TRUE)
output$non_zero <- aggregate_sum(collated, "non_zero")
} else {
collated <- list(list(missing=anyNA(x)))
}
output$missing <- aggregate_any(collated, "missing")
output
})

setMethod("collect_boolean_attributes", "lsparseMatrix", function(x) {
list(missing=anyNA(x), non_zero=length(x@x))
})

setMethod("collect_boolean_attributes", "SVT_SparseMatrix", function(x) {
collated <- lapply(x@SVT, function(y) list(missing=anyNA(y[[2]]), non_zero=length(y[[2]])))
list(
missing=aggregate_any(collated, "missing"),
non_zero=aggregate_sum(collated, "non_zero")
)
})

optimize_boolean_storage <- function(x) {
if (anyNA(x)) {
list(type="H5T_NATIVE_INT8", placeholder=-1L)
attr <- collect_boolean_attributes(x)
if (attr$missing) {
list(type="H5T_NATIVE_INT8", placeholder=-1L, size=attr$non_zero)
} else {
list(type="H5T_NATIVE_INT8", placeholder=NULL)
list(type="H5T_NATIVE_INT8", placeholder=NULL, size=attr$non_zero)
}
}
52 changes: 52 additions & 0 deletions tests/testthat/test-optimize_storage.R
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,55 @@ test_that("storage optimization works for booleans", {
expect_equal(out$placeholder, -1L)
}
})

test_that("storage optimization works for sparse objects", {
old <- getAutoBlockSize()
setAutoBlockSize(400)
on.exit(setAutoBlockSize(old))

for (i in 1:3) {
if (i == 1L) {
fun <- function(x) as(x, "sparseMatrix")
} else if (i == 2L) {
fun <- function(x) as(x, "SVT_SparseMatrix")
} else {
fun <- function(x) DelayedArray(as(x, "SparseArraySeed"))
}

# Integer.
{
y <- matrix(0L, nrow=100, ncol=10)
y[1+sample(999, 100)] <- 10000L
y[1] <- NA

out <- alabaster.matrix:::optimize_storage(fun(y))
expect_identical(out$type, "H5T_NATIVE_UINT16")
expect_equal(out$placeholder, 2^16-1)
expect_identical(out$size, 101L)
}

# Boolean
{
y <- matrix(FALSE, nrow=100, ncol=10)
y[1+sample(999, 50)] <- TRUE
y[1] <- NA

out <- alabaster.matrix:::optimize_storage(fun(y))
expect_identical(out$type, "H5T_NATIVE_INT8")
expect_equal(out$placeholder, -1L)
expect_identical(out$size, 51L)
}

# Number
{
y <- matrix(0, nrow=100, ncol=10)
y[1+sample(999, 200)] <- runif(200)
y[1] <- NA

out <- alabaster.matrix:::optimize_storage(fun(y))
expect_identical(out$type, "H5T_NATIVE_DOUBLE")
expect_equal(out$placeholder, NaN)
expect_identical(out$size, 201L)
}
}
})

0 comments on commit e2dba08

Please sign in to comment.