From 4e79db069ab6a10a80e6971dd26099fca991612c Mon Sep 17 00:00:00 2001 From: LTLA Date: Mon, 27 Nov 2023 10:18:15 -0800 Subject: [PATCH] Further work on the storage optimizer. --- NAMESPACE | 2 + R/RcppExports.R | 15 + R/namespace.R | 4 + R/optimize_storage.R | 117 ++++- src/RcppExports.cpp | 52 +++ ..._attributes.cpp => collect_attributes.cpp} | 8 +- tests/testthat/test-optimize_storage.R | 400 ++++++++++++++++++ 7 files changed, 580 insertions(+), 18 deletions(-) create mode 100644 R/RcppExports.R create mode 100644 R/namespace.R create mode 100644 src/RcppExports.cpp rename src/{collect_float_attributes.cpp => collect_attributes.cpp} (94%) create mode 100644 tests/testthat/test-optimize_storage.R diff --git a/NAMESPACE b/NAMESPACE index 1a5ae73..84de153 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -67,6 +67,7 @@ importFrom(HDF5Array,H5SparseMatrixSeed) importFrom(HDF5Array,HDF5Array) importFrom(HDF5Array,HDF5ArraySeed) importFrom(HDF5Array,writeHDF5Array) +importFrom(Rcpp,sourceCpp) importFrom(S4Vectors,new2) importFrom(SparseArray,extract_sparse_array) importFrom(alabaster.base,.loadObject) @@ -91,3 +92,4 @@ importFrom(rhdf5,h5readAttributes) importFrom(rhdf5,h5write) importFrom(rhdf5,h5writeAttribute) importFrom(rhdf5,h5writeDataset) +useDynLib(alabaster.matrix, .registration=TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..6f39ffe --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,15 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +lowest_double <- function() { + .Call(`_alabaster_matrix_lowest_double`) +} + +highest_double <- function() { + .Call(`_alabaster_matrix_highest_double`) +} + +collect_double_attributes <- function(x) { + .Call(`_alabaster_matrix_collect_double_attributes`, x) +} + diff --git a/R/namespace.R b/R/namespace.R new file mode 100644 index 0000000..87315c7 --- /dev/null +++ b/R/namespace.R @@ -0,0 +1,4 @@ +#' @import alabaster.base rhdf5 +#' @importFrom Rcpp sourceCpp +#' @useDynLib alabaster.matrix, .registration=TRUE +NULL diff --git a/R/optimize_storage.R b/R/optimize_storage.R index 1c63f87..f28fdcf 100644 --- a/R/optimize_storage.R +++ b/R/optimize_storage.R @@ -1,3 +1,21 @@ +optimize_storage <- function(x) { + tt <- type(x) + if (tt == "character") { + optimize_string_storage(x) + } else if (tt == "double") { + optimize_float_storage(x) + } else if (tt == "integer") { + optimize_integer_storage(x) + } else if (tt == "logical") { + optimize_boolean_storage(x) + } else { + stop("unsupported type '", tt, "'") + } +} + +################################################### +################################################### + aggregate_range <- function(collated, name) { range(unlist(lapply(collated, function(y) y[[name]]))) } @@ -6,21 +24,26 @@ aggregate_any <- function(collated, name) { any(vapply(collated, function(y) y[[name]], TRUE)) } +aggregate_max <- function(collated, name) { + max(unlist(lapply(collated, function(y) y[[name]])), na.rm=TRUE) +} + ################################################### ################################################### setGeneric("collect_integer_attributes", function(x) standardGeneric("collect_integer_attributes")) -setMethod("collect_integer_attributes", "array", function(x) { +.simple_integer_collector <- function(x) { list( range=suppressWarnings(range(x, na.rm=TRUE)), missing=anyNA(x) ) -}) +} -setMethod("collect_integer_attributes", "ANY", function(x) { +setMethod("collect_integer_attributes", "array", .simple_integer_collector) - collated <- blockApply(x, collect_integer_attributes) +setMethod("collect_integer_attributes", "ANY", function(x) { + collated <- blockApply(x, .simple_integer_collector) list( range=aggregate_range(collated, "range"), missing=aggregate_any(collated, "missing") @@ -34,20 +57,20 @@ optimize_integer_storage <- function(x) { lower <- attr$range[1] upper <- attr$range[2] if (is.infinite(lower)) { - return(list(type="H5T_NATIVE_INT8", placeholder=-2L^7L)) + return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7))) } if (lower < 0L) { if (lower > -2^7 && upper < 2^7) { - return(list(type="H5T_NATIVE_INT8", placeholder=-2L^7L)) + return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7))) } else if (lower > -2^15 && upper < 2^15) { - return(list(type="H5T_NATIVE_INT16", placeholder=-2L^15L)) + return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15))) } } else { if (upper < 2^8 - 1) { - return(list(type="H5T_NATIVE_UINT8", placeholder=2L^8L-1L)) + return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1))) } else if (upper < 2^16 - 1) { - return(list(type="H5T_NATIVE_UINT16", placeholder=2L^16L-1L)) + return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1))) } } @@ -83,13 +106,15 @@ optimize_integer_storage <- function(x) { setGeneric("collect_float_attributes", function(x) standardGeneric("collect_float_attributes")) -setMethod("collect_float_attributes", "array", collect_float_attributes) +setMethod("collect_float_attributes", "array", collect_double_attributes) + +setMethod("collect_float_attributes", "ddenseMatrix", function(x) collect_double_attributes(x@x)) setMethod("collect_float_attributes", "ANY", function(x) { - collated <- blockApply(x, collect_float_attributes) + collated <- blockApply(x, collect_double_attributes) output <- list(range=aggregate_range(collated, "range")) - for (n in c("missing", "specials", "non_integer", "has_NaN", "has_Inf", "has_nInf", "has_lowest", "has_highest")) { + for (n in c("missing", "non_integer", "has_NaN", "has_Inf", "has_nInf", "has_lowest", "has_highest")) { output[[n]] <- aggregate_any(collated, n) } @@ -131,8 +156,8 @@ optimize_float_storage <- function(x) { placeholder <- -Inf } else if (!attr$has_lowest) { placeholder <- lowest_double() - } else if (!attr$has_max) { - placeholder <- max_double() + } else if (!attr$has_highest) { + placeholder <- highest_double() } # Fallback that just goes through and pulls out all unique values. @@ -169,3 +194,67 @@ optimize_float_storage <- function(x) { return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL)) } } + +################################################### +################################################### + +setGeneric("collect_string_attributes", function(x) standardGeneric("collect_string_attributes")) + +setMethod("collect_string_attributes", "ANY", function(x) { + collected <- blockApply(x, function(y) { + list( + has_na1=any(y == "NA", na.rm=TRUE), + has_na2=any(y == "_NA", na.rm=TRUE), + max_len=suppressWarnings(max(nchar(y, "bytes"), na.rm=TRUE)), + missing=anyNA(y), + encoding=unique(Encoding(y)) + ) + }) + + list( + has_na1=aggregate_any(collected, "has_na1"), + has_na2=aggregate_any(collected, "has_na2"), + max_len=aggregate_max(collected, "max_len"), + missing=aggregate_any(collected, "missing"), + encoding=Reduce(union, lapply(collected, function(y) y$encoding)) + ) +}) + +optimize_string_storage <- function(x) { + attr <- collect_string_attributes(x) + + placeholder <- NULL + if (attr$missing) { + if (!attr$has_na1) { + placeholder <- "NA" + } else if (!attr$has_na2) { + placeholder <- "_NA" + } else { + u <- Reduce(union, blockApply(x, function(y) unique(as.vector(y)))) + placeholder <- chooseMissingPlaceholderForHdf5(u) + } + attr$max_len <- max(attr$max_len, nchar(placeholder, "bytes")) + } + + tid <- H5Tcopy("H5T_C_S1") + H5Tset_strpad(tid, strpad = "NULLPAD") + H5Tset_size(tid, max(1L, attr$max_len)) + if ("UTF-8" %in% attr$encoding) { + H5Tset_cset(tid, "UTF8") + } else { + H5Tset_cset(tid, "ASCII") + } + + list(type=tid, placeholder=placeholder) +} + +################################################### +################################################### + +optimize_boolean_storage <- function(x) { + if (anyNA(x)) { + list(type="H5T_NATIVE_INT8", placeholder=-1L) + } else { + list(type="H5T_NATIVE_INT8", placeholder=NULL) + } +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..318e087 --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,52 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + +// lowest_double +double lowest_double(); +RcppExport SEXP _alabaster_matrix_lowest_double() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + rcpp_result_gen = Rcpp::wrap(lowest_double()); + return rcpp_result_gen; +END_RCPP +} +// highest_double +double highest_double(); +RcppExport SEXP _alabaster_matrix_highest_double() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + rcpp_result_gen = Rcpp::wrap(highest_double()); + return rcpp_result_gen; +END_RCPP +} +// collect_double_attributes +Rcpp::List collect_double_attributes(Rcpp::NumericVector x); +RcppExport SEXP _alabaster_matrix_collect_double_attributes(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::NumericVector >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(collect_double_attributes(x)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_alabaster_matrix_lowest_double", (DL_FUNC) &_alabaster_matrix_lowest_double, 0}, + {"_alabaster_matrix_highest_double", (DL_FUNC) &_alabaster_matrix_highest_double, 0}, + {"_alabaster_matrix_collect_double_attributes", (DL_FUNC) &_alabaster_matrix_collect_double_attributes, 1}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_alabaster_matrix(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/collect_float_attributes.cpp b/src/collect_attributes.cpp similarity index 94% rename from src/collect_float_attributes.cpp rename to src/collect_attributes.cpp index 9023ac1..93a6c1a 100644 --- a/src/collect_float_attributes.cpp +++ b/src/collect_attributes.cpp @@ -8,7 +8,7 @@ double lowest_double() { } //[[Rcpp::export(rng=false)]] -double max_double() { +double highest_double() { return std::numeric_limits::max(); } @@ -27,7 +27,7 @@ Rcpp::List collect_double_attributes(Rcpp::NumericVector x) { bool has_neginf = false; { for (auto y : x) { - if (!ISNA(y) && !std::isnan(y)) { + if (!ISNA(y) && std::isnan(y)) { has_nan = true; break; } @@ -48,7 +48,7 @@ Rcpp::List collect_double_attributes(Rcpp::NumericVector x) { bool non_integer = true; double minv = R_PosInf, maxv = R_NegInf; - if (has_nan || has_posinf || has_neginf) { + if (!has_nan && !has_posinf && !has_neginf) { non_integer = false; for (auto y : x) { if (!ISNA(y)) { @@ -79,7 +79,7 @@ Rcpp::List collect_double_attributes(Rcpp::NumericVector x) { } } - double highest = max_double(); + double highest = highest_double(); for (auto y : x) { if (!ISNA(y) && y == highest) { has_highest = true; diff --git a/tests/testthat/test-optimize_storage.R b/tests/testthat/test-optimize_storage.R new file mode 100644 index 0000000..fd34074 --- /dev/null +++ b/tests/testthat/test-optimize_storage.R @@ -0,0 +1,400 @@ +# This checks that the storage optimization works as expected. +# library(testthat); library(alabaster.matrix); source("test-optimize_storage.R", encoding="UTF-8") + +library(DelayedArray) +library(rhdf5) + +test_that("storage optimization works for integers, no missing values", { + old <- getAutoBlockSize() + setAutoBlockSize(400) + on.exit(setAutoBlockSize(old)) + + for (i in 1:2) { + fun <- identity + if (i == 1L) { + fun <- DelayedArray + } + + # < 8-bit. + mat <- matrix(seq(0, 255, length.out=1000), 50, 20) + storage.mode(mat) <- "integer" + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT8") + expect_null(out$placeholder) + out <- alabaster.matrix:::optimize_storage(fun(mat - 128L)) + expect_equal(out$type, "H5T_NATIVE_INT8") + out <- alabaster.matrix:::optimize_storage(fun(mat + 1L)) + expect_equal(out$type, "H5T_NATIVE_UINT16") + + # < 16-bit. + mat <- matrix(seq(0, 65535, length.out=1000), 50, 20) + storage.mode(mat) <- "integer" + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_null(out$placeholder) + out <- alabaster.matrix:::optimize_storage(fun(mat - 32768L)) + expect_equal(out$type, "H5T_NATIVE_INT16") + out <- alabaster.matrix:::optimize_storage(fun(mat + 1L)) + expect_equal(out$type, "H5T_NATIVE_INT32") + + # < 32-bit. + mat <- matrix(seq(0, .Machine$integer.max, length.out=1000), 50, 20) + storage.mode(mat) <- "integer" + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_null(out$placeholder) + } +}) + +test_that("storage optimization works for integers, plus missing values", { + old <- getAutoBlockSize() + setAutoBlockSize(400) + on.exit(setAutoBlockSize(old)) + + for (i in 1:2) { + fun <- identity + if (i == 1L) { + fun <- DelayedArray + } + + # < 8-bit, unsigned. + mat <- matrix(seq(0, 255, length.out=1000), 50, 20) + mat[1000] <- NA + storage.mode(mat) <- "integer" # Note that cast to int does a truncation, so there's only ever one value at a non-zero extreme. + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT8") + expect_equal(out$placeholder, 255L) + + mat[999] <- 255L + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 65535L) + + # 8-bit, signed. + mat <- matrix(seq(-128, 127, length.out=1000), 50, 20) + mat[1] <- NA + storage.mode(mat) <- "integer" + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT8") + expect_equal(out$placeholder, -128L) + + mat[2] <- -128L + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT16") + expect_equal(out$placeholder, -32768L) + + # < 16-bit, unsigned. + mat <- matrix(seq(0, 65535, length.out=1000), 50, 20) + mat[1000] <- NA + storage.mode(mat) <- "integer" + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 65535) + + mat[999] <- 65535L + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, NA_integer_) + + # 16-bit, signed. + mat <- matrix(seq(-32768, 32767, length.out=1000), 50, 20) + mat[1] <- NA + storage.mode(mat) <- "integer" + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT16") + expect_equal(out$placeholder, -32768L) + + mat[2] <- -32768L + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, NA_integer_) + + # Everything else. + mat <- matrix(seq(0, .Machine$integer.max, length.out=1000), 50, 20) + storage.mode(mat) <- "integer" + mat[1] <- NA + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, NA_integer_) + } +}) + +test_that("storage optimization works for integer-like doubles, no missing values", { + old <- getAutoBlockSize() + setAutoBlockSize(400) + on.exit(setAutoBlockSize(old)) + + for (i in 1:2) { + fun <- identity + if (i == 1L) { + fun <- DelayedArray + } + + # < 8-bit. + mat <- round(matrix(seq(0, 255, length.out=1000), 50, 20)) + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT8") + expect_null(out$placeholder) + out <- alabaster.matrix:::optimize_storage(fun(mat - 128L)) + expect_equal(out$type, "H5T_NATIVE_INT8") + out <- alabaster.matrix:::optimize_storage(fun(mat + 1L)) + expect_equal(out$type, "H5T_NATIVE_UINT16") + + # < 16-bit. + mat <- round(matrix(seq(0, 65535, length.out=1000), 50, 20)) + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_null(out$placeholder) + out <- alabaster.matrix:::optimize_storage(fun(mat - 32768L)) + expect_equal(out$type, "H5T_NATIVE_INT16") + out <- alabaster.matrix:::optimize_storage(fun(mat + 1L)) + expect_equal(out$type, "H5T_NATIVE_UINT32") + + # < 32-bit. + mat <- round(matrix(seq(0, 2^32-1, length.out=1000), 50, 20)) + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT32") + expect_null(out$placeholder) + out <- alabaster.matrix:::optimize_storage(fun(mat - 2^31)) + expect_equal(out$type, "H5T_NATIVE_INT32") + out <- alabaster.matrix:::optimize_storage(fun(mat + 1L)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + } +}) + +test_that("storage optimization works for integer-like doubles, plus missing values", { + old <- getAutoBlockSize() + setAutoBlockSize(400) + on.exit(setAutoBlockSize(old)) + + for (i in 1:2) { + fun <- identity + if (i == 1L) { + fun <- DelayedArray + } + + # < 8-bit, unsigned + mat <- round(matrix(seq(0, 255, length.out=1000), 50, 20)) + mat[mat == 255] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT8") + expect_equal(out$placeholder, 255) + + mat[999] <- 255 + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 65535) + + # < 8-bit, signed + mat <- round(matrix(seq(-128, 127, length.out=1000), 50, 20)) + mat[mat==-128] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT8") + expect_equal(out$placeholder, -128) + + mat[2] <- -128 + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT16") + expect_equal(out$placeholder, -32768) + + # < 16-bit, unsigned + mat <- round(matrix(seq(0, 65535, length.out=1000), 50, 20)) + mat[mat==65535] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 65535) + + mat[999] <- 65535 + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT32") + expect_equal(out$placeholder, 2^32-1) + + # < 16-bit, signed + mat <- round(matrix(seq(-2^15, 2^15-1, length.out=1000), 50, 20)) + mat[mat==-32768] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT16") + expect_equal(out$placeholder, -32768) + + mat[2] <- -32768 + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, -2^31) + + # < 32-bit, unsigned + mat <- round(matrix(seq(0, 2^32-1, length.out=1000), 50, 20)) + mat[mat==2^32-1] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_UINT32") + expect_equal(out$placeholder, 2^32-1) + + mat[999] <- 2^32-1 + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, NaN) + + # < 32-bit, signed + mat <- round(matrix(seq(-2^31, 2^31-1, length.out=1000), 50, 20)) + mat[mat == -2^31] <- NA + + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, -2^31) + + mat[999] <- -2^31 + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, NaN) + } +}) + +test_that("storage optimization works non-integer doubles", { + old <- getAutoBlockSize() + setAutoBlockSize(400) + on.exit(setAutoBlockSize(old)) + + for (i in 1:2) { + fun <- identity + if (i == 1L) { + fun <- DelayedArray + } + + mat <- matrix(rnorm(1000), 50, 20) + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_null(out$placeholder) + + # Running through the gamut of missing value placeholders. + mat <- matrix(rnorm(1000), 50, 20) + mat[1] <- NA + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, NaN) + + mat[2] <- NaN + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, Inf) + + mat[3] <- Inf + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, -Inf) + + mat[4] <- -Inf + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, alabaster.matrix:::lowest_double()) + + mat[5] <- alabaster.matrix:::lowest_double() + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, alabaster.matrix:::highest_double()) + + mat[6] <- alabaster.matrix:::highest_double() + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_true(!any(mat==out$placeholder, na.rm=TRUE)) + } +}) + +test_that("storage optimization works for strings", { + old <- getAutoBlockSize() + setAutoBlockSize(400) + on.exit(setAutoBlockSize(old)) + + for (i in 1:2) { + fun <- identity + if (i == 1L) { + fun <- DelayedArray + } + + mat <- matrix(sample(LETTERS, 1000, replace=TRUE), 50, 20) + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 1) + expect_equal(H5Tget_cset(out$type), 0) # aka ASCII + expect_equal(out$placeholder, NULL) + + mat[1] <- NA + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 2) + expect_equal(out$placeholder, "NA") + + mat[2] <- "NA" + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 3) + expect_equal(out$placeholder, "_NA") + + mat[3] <- "_NA" + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 4) + expect_equal(out$placeholder, "__NA") + + # Correct size determination. + mat <- matrix(sample(LETTERS, 1000, replace=TRUE), 50, 20) + mat[1] <- "Aaron" + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 5) + + mat <- matrix(sample(LETTERS, 1000, replace=TRUE), 50, 20) + mat[1000] <- "Aaron" + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 5) + + mat <- matrix("", 50, 20) + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 1) + + # Checking for correct behavior in all-NA cases. + mat <- matrix(NA_character_, 50, 20) + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 2) + expect_identical(out$placeholder, "NA") + + # Handles UTF-8. Assumes that this file was sourced as UTF-8. + mat <- matrix("α", 50, 20) + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(H5Tget_size(out$type), 2) + if (Encoding(mat[1]) == "UTF-8") { + expect_equal(H5Tget_cset(out$type), 1) # aka UTF-8 + } + } +}) + +test_that("storage optimization works for booleans", { + old <- getAutoBlockSize() + setAutoBlockSize(400) + on.exit(setAutoBlockSize(old)) + + for (i in 1:2) { + fun <- identity + if (i == 1L) { + fun <- DelayedArray + } + + mat <- matrix(c(TRUE, FALSE), 50, 20) + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT8") + expect_null(out$placeholder) + + mat[1] <- NA + out <- alabaster.matrix:::optimize_storage(fun(mat)) + expect_equal(out$type, "H5T_NATIVE_INT8") + expect_equal(out$placeholder, -1L) + } +})