Skip to content

Commit

Permalink
Further work on the storage optimizer.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Nov 27, 2023
1 parent fc1eeba commit 4e79db0
Show file tree
Hide file tree
Showing 7 changed files with 580 additions and 18 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ importFrom(HDF5Array,H5SparseMatrixSeed)
importFrom(HDF5Array,HDF5Array)
importFrom(HDF5Array,HDF5ArraySeed)
importFrom(HDF5Array,writeHDF5Array)
importFrom(Rcpp,sourceCpp)
importFrom(S4Vectors,new2)
importFrom(SparseArray,extract_sparse_array)
importFrom(alabaster.base,.loadObject)
Expand All @@ -91,3 +92,4 @@ importFrom(rhdf5,h5readAttributes)
importFrom(rhdf5,h5write)
importFrom(rhdf5,h5writeAttribute)
importFrom(rhdf5,h5writeDataset)
useDynLib(alabaster.matrix, .registration=TRUE)
15 changes: 15 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

lowest_double <- function() {
.Call(`_alabaster_matrix_lowest_double`)
}

highest_double <- function() {
.Call(`_alabaster_matrix_highest_double`)
}

collect_double_attributes <- function(x) {
.Call(`_alabaster_matrix_collect_double_attributes`, x)
}

4 changes: 4 additions & 0 deletions R/namespace.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#' @import alabaster.base rhdf5
#' @importFrom Rcpp sourceCpp
#' @useDynLib alabaster.matrix, .registration=TRUE
NULL
117 changes: 103 additions & 14 deletions R/optimize_storage.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
optimize_storage <- function(x) {
tt <- type(x)
if (tt == "character") {
optimize_string_storage(x)
} else if (tt == "double") {
optimize_float_storage(x)
} else if (tt == "integer") {
optimize_integer_storage(x)
} else if (tt == "logical") {
optimize_boolean_storage(x)
} else {
stop("unsupported type '", tt, "'")
}
}

###################################################
###################################################

aggregate_range <- function(collated, name) {
range(unlist(lapply(collated, function(y) y[[name]])))
}
Expand All @@ -6,21 +24,26 @@ aggregate_any <- function(collated, name) {
any(vapply(collated, function(y) y[[name]], TRUE))
}

aggregate_max <- function(collated, name) {
max(unlist(lapply(collated, function(y) y[[name]])), na.rm=TRUE)
}

###################################################
###################################################

setGeneric("collect_integer_attributes", function(x) standardGeneric("collect_integer_attributes"))

setMethod("collect_integer_attributes", "array", function(x) {
.simple_integer_collector <- function(x) {
list(
range=suppressWarnings(range(x, na.rm=TRUE)),
missing=anyNA(x)
)
})
}

setMethod("collect_integer_attributes", "ANY", function(x) {
setMethod("collect_integer_attributes", "array", .simple_integer_collector)

collated <- blockApply(x, collect_integer_attributes)
setMethod("collect_integer_attributes", "ANY", function(x) {
collated <- blockApply(x, .simple_integer_collector)
list(
range=aggregate_range(collated, "range"),
missing=aggregate_any(collated, "missing")
Expand All @@ -34,20 +57,20 @@ optimize_integer_storage <- function(x) {
lower <- attr$range[1]
upper <- attr$range[2]
if (is.infinite(lower)) {
return(list(type="H5T_NATIVE_INT8", placeholder=-2L^7L))
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7)))
}

if (lower < 0L) {
if (lower > -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=-2L^7L))
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7)))
} else if (lower > -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=-2L^15L))
return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15)))
}
} else {
if (upper < 2^8 - 1) {
return(list(type="H5T_NATIVE_UINT8", placeholder=2L^8L-1L))
return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1)))
} else if (upper < 2^16 - 1) {
return(list(type="H5T_NATIVE_UINT16", placeholder=2L^16L-1L))
return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1)))
}
}

Expand Down Expand Up @@ -83,13 +106,15 @@ optimize_integer_storage <- function(x) {

setGeneric("collect_float_attributes", function(x) standardGeneric("collect_float_attributes"))

setMethod("collect_float_attributes", "array", collect_float_attributes)
setMethod("collect_float_attributes", "array", collect_double_attributes)

setMethod("collect_float_attributes", "ddenseMatrix", function(x) collect_double_attributes(x@x))

setMethod("collect_float_attributes", "ANY", function(x) {
collated <- blockApply(x, collect_float_attributes)
collated <- blockApply(x, collect_double_attributes)

output <- list(range=aggregate_range(collated, "range"))
for (n in c("missing", "specials", "non_integer", "has_NaN", "has_Inf", "has_nInf", "has_lowest", "has_highest")) {
for (n in c("missing", "non_integer", "has_NaN", "has_Inf", "has_nInf", "has_lowest", "has_highest")) {
output[[n]] <- aggregate_any(collated, n)
}

Expand Down Expand Up @@ -131,8 +156,8 @@ optimize_float_storage <- function(x) {
placeholder <- -Inf
} else if (!attr$has_lowest) {
placeholder <- lowest_double()
} else if (!attr$has_max) {
placeholder <- max_double()
} else if (!attr$has_highest) {
placeholder <- highest_double()
}

# Fallback that just goes through and pulls out all unique values.
Expand Down Expand Up @@ -169,3 +194,67 @@ optimize_float_storage <- function(x) {
return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL))
}
}

###################################################
###################################################

setGeneric("collect_string_attributes", function(x) standardGeneric("collect_string_attributes"))

setMethod("collect_string_attributes", "ANY", function(x) {
collected <- blockApply(x, function(y) {
list(
has_na1=any(y == "NA", na.rm=TRUE),
has_na2=any(y == "_NA", na.rm=TRUE),
max_len=suppressWarnings(max(nchar(y, "bytes"), na.rm=TRUE)),
missing=anyNA(y),
encoding=unique(Encoding(y))
)
})

list(
has_na1=aggregate_any(collected, "has_na1"),
has_na2=aggregate_any(collected, "has_na2"),
max_len=aggregate_max(collected, "max_len"),
missing=aggregate_any(collected, "missing"),
encoding=Reduce(union, lapply(collected, function(y) y$encoding))
)
})

optimize_string_storage <- function(x) {
attr <- collect_string_attributes(x)

placeholder <- NULL
if (attr$missing) {
if (!attr$has_na1) {
placeholder <- "NA"
} else if (!attr$has_na2) {
placeholder <- "_NA"
} else {
u <- Reduce(union, blockApply(x, function(y) unique(as.vector(y))))
placeholder <- chooseMissingPlaceholderForHdf5(u)
}
attr$max_len <- max(attr$max_len, nchar(placeholder, "bytes"))
}

tid <- H5Tcopy("H5T_C_S1")
H5Tset_strpad(tid, strpad = "NULLPAD")
H5Tset_size(tid, max(1L, attr$max_len))
if ("UTF-8" %in% attr$encoding) {
H5Tset_cset(tid, "UTF8")
} else {
H5Tset_cset(tid, "ASCII")
}

list(type=tid, placeholder=placeholder)
}

###################################################
###################################################

optimize_boolean_storage <- function(x) {
if (anyNA(x)) {
list(type="H5T_NATIVE_INT8", placeholder=-1L)
} else {
list(type="H5T_NATIVE_INT8", placeholder=NULL)
}
}
52 changes: 52 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <Rcpp.h>

using namespace Rcpp;

#ifdef RCPP_USE_GLOBAL_ROSTREAM
Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// lowest_double
double lowest_double();
RcppExport SEXP _alabaster_matrix_lowest_double() {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
rcpp_result_gen = Rcpp::wrap(lowest_double());
return rcpp_result_gen;
END_RCPP
}
// highest_double
double highest_double();
RcppExport SEXP _alabaster_matrix_highest_double() {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
rcpp_result_gen = Rcpp::wrap(highest_double());
return rcpp_result_gen;
END_RCPP
}
// collect_double_attributes
Rcpp::List collect_double_attributes(Rcpp::NumericVector x);
RcppExport SEXP _alabaster_matrix_collect_double_attributes(SEXP xSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::traits::input_parameter< Rcpp::NumericVector >::type x(xSEXP);
rcpp_result_gen = Rcpp::wrap(collect_double_attributes(x));
return rcpp_result_gen;
END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_alabaster_matrix_lowest_double", (DL_FUNC) &_alabaster_matrix_lowest_double, 0},
{"_alabaster_matrix_highest_double", (DL_FUNC) &_alabaster_matrix_highest_double, 0},
{"_alabaster_matrix_collect_double_attributes", (DL_FUNC) &_alabaster_matrix_collect_double_attributes, 1},
{NULL, NULL, 0}
};

RcppExport void R_init_alabaster_matrix(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ double lowest_double() {
}

//[[Rcpp::export(rng=false)]]
double max_double() {
double highest_double() {
return std::numeric_limits<double>::max();
}

Expand All @@ -27,7 +27,7 @@ Rcpp::List collect_double_attributes(Rcpp::NumericVector x) {
bool has_neginf = false;
{
for (auto y : x) {
if (!ISNA(y) && !std::isnan(y)) {
if (!ISNA(y) && std::isnan(y)) {
has_nan = true;
break;
}
Expand All @@ -48,7 +48,7 @@ Rcpp::List collect_double_attributes(Rcpp::NumericVector x) {

bool non_integer = true;
double minv = R_PosInf, maxv = R_NegInf;
if (has_nan || has_posinf || has_neginf) {
if (!has_nan && !has_posinf && !has_neginf) {
non_integer = false;
for (auto y : x) {
if (!ISNA(y)) {
Expand Down Expand Up @@ -79,7 +79,7 @@ Rcpp::List collect_double_attributes(Rcpp::NumericVector x) {
}
}

double highest = max_double();
double highest = highest_double();
for (auto y : x) {
if (!ISNA(y) && y == highest) {
has_highest = true;
Expand Down
Loading

0 comments on commit 4e79db0

Please sign in to comment.