Skip to content

Commit

Permalink
Migrate storage type optimization code from alabaster.matrix. (#27)
Browse files Browse the repository at this point in the history
This is a prelude to the migration of array-saving code from alabaster.matrix.
There are also some opportunities for storage optimization of atomic vectors 
but this requires some refactoring that I'll save for later.
  • Loading branch information
LTLA committed Sep 22, 2024
1 parent 7509a17 commit 1a37b0c
Show file tree
Hide file tree
Showing 8 changed files with 924 additions and 0 deletions.
12 changes: 12 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ export(anyMissing)
export(as.Rfc3339)
export(checkValidDirectory)
export(chooseMissingPlaceholderForHdf5)
export(collect_boolean_attributes)
export(collect_integer_attributes)
export(collect_number_attributes)
export(collect_string_attributes)
export(createRedirection)
export(customloadObjectHelper)
export(h5_cast)
Expand All @@ -67,6 +71,10 @@ export(loadDirectory)
export(loadObject)
export(missingPlaceholderName)
export(moveObject)
export(optimize_boolean_storage)
export(optimize_integer_storage)
export(optimize_number_storage)
export(optimize_string_storage)
export(processMcols)
export(processMetadata)
export(quickLoadObject)
Expand Down Expand Up @@ -106,6 +114,10 @@ export(validateObject)
export(writeMetadata)
exportMethods(acquireFile)
exportMethods(acquireMetadata)
exportMethods(collect_boolean_attributes)
exportMethods(collect_integer_attributes)
exportMethods(collect_number_attributes)
exportMethods(collect_string_attributes)
exportMethods(saveObject)
exportMethods(stageObject)
import(alabaster.schemas)
Expand Down
16 changes: 16 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,22 @@ choose_numeric_missing_placeholder <- function(x) {
.Call(`_alabaster_base_choose_numeric_missing_placeholder`, x)
}

collect_character_attributes <- function(x) {
.Call(`_alabaster_base_collect_character_attributes`, x)
}

lowest_double <- function() {
.Call(`_alabaster_base_lowest_double`)
}

highest_double <- function() {
.Call(`_alabaster_base_highest_double`)
}

collect_numeric_attributes <- function(x) {
.Call(`_alabaster_base_collect_numeric_attributes`, x)
}

not_rfc3339 <- function(x) {
.Call(`_alabaster_base_not_rfc3339`, x)
}
Expand Down
313 changes: 313 additions & 0 deletions R/optimize_storage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
#' HDF5 storage type optimization
#'
#' Optimize HDF5 storage to use the smallest possible storage type.
#' Intended for \pkg{alabaster.*} developers only.
#'
#' @param x An atomic vector or array of the specified type,
#' e.g., integer vector/array for \code{collect_integer_attributes} and \code{optimize_integer_storage}.
#' Developers can also extend this to abstract objects containing values of the same type, e.g., matrix-like S4 classes.
#' @param fallback Function that accepts \code{x} and returns a suitable placeholder in the presence of \code{NA}s.
#' If \code{NULL}, this defaults to \code{\link{chooseMissingPlaceholderForHdf5}}.
#'
#' @return
#' For \code{collect_integer_attributes}, a named list containing:
#' \itemize{
#' \item \code{min}, the smallest non-\code{NA} value in \code{x}.
#' This is set to Inf if all values are \code{NA}.
#' \item \code{max}, the largest non-\code{NA} value in \code{x}.
#' This is set to -Inf if all values are \code{NA}.
#' \item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}.
#' }
#'
#' For \code{collect_number_attributes}, a named list containing:
#' \itemize{
#' \item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}.
#' \item \code{non_integer}, logical scalar indicating whether any values in \code{x} are non-integer.
#' \item \code{min}, the smallest non-\code{NA} integer value in \code{x}.
#' This is set to Inf if there are any non-integer or \code{NA} values.
#' \item \code{max}, the largest non-\code{NA} integer value in \code{x}.
#' This is set to -Inf if there are any non-integer or \code{NA} values.
#' \item \code{has_NaN}, logical scalar indicating whether NaN is present in \code{x}.
#' \item \code{has_Inf}, logical scalar indicating whether positive infinity is present in \code{x}.
#' \item \code{has_NegInf}, logical scalar indicating whether negative infinity is present in \code{x}.
#' \item \code{has_lowest}, logical scalar indicating whether the smallest double-precision value is present in \code{x}.
#' \item \code{has_highest}, logical scalar indicating whether the highest double-precision value is present in \code{x}.
#' }
#'
#' For \code{collect_string_attributes}, a named list containing:
#' \itemize{
#' \item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}.
#' \item \code{has_NA}, logical scalar indicating whether the \code{"NA"} string is present in \code{x}.
#' \item \code{has__NA}, logical scalar indicating whether the \code{"_NA"} string is present in \code{x}.
#' \item \code{max_len}, integer scalar specifying the maximum length of the strings in \code{x}.
#' }
#'
#' For \code{collect_boolean_attributes}, a named list containing:
#' \itemize{
#' \item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}.
#' }
#'
#' For the \code{optimize_*_storage} functions, a named list containing:
#' \itemize{
#' \item \code{type}, string containing the HDF5 datatype for storing \code{x}.
#' \item \code{placeholder}, value of the placeholder for \code{NA} values.
#' \item \code{other}, other attributes of \code{x} (e.g., number of non-zero elements for sparse vectors).
#' These should be stored in an \code{other} field in the named list returned by \code{collect_*_ attributes}.
#' }
#'
#' @author Aaron Lun
#' @aliases
#' collect_integer_attributes,integer-method
#' collect_integer_attributes,array-method
#' collect_number_attributes,double-method
#' collect_number_attributes,array-method
#' collect_string_attributes,character-method
#' collect_string_attributes,array-method
#' collect_boolean_attributes,logical-method
#' collect_boolean_attributes,array-method
#' @name optimize_storage
NULL

#' @export
#' @rdname optimize_storage
setGeneric("collect_integer_attributes", function(x) standardGeneric("collect_integer_attributes"))

.collect_integer_attributes_raw <- function(x) {
range <- suppressWarnings(range(x, na.rm=TRUE))
list(
min=range[1],
max=range[2],
missing=anyNA(x)
)
}

#' @export
setMethod("collect_integer_attributes", "integer", .collect_integer_attributes_raw)

#' @export
setMethod("collect_integer_attributes", "array", .collect_integer_attributes_raw)

#' @export
#' @rdname optimize_storage
optimize_integer_storage <- function(x) {
attr <- collect_integer_attributes(x)
lower <- attr$min
upper <- attr$max

if (attr$missing) {
# If it's infinite, that means that there are only missing values in
# 'x', otherwise there should have been at least one finite value
# available. In any case, it means we can just do whatever we want so
# we'll just use the smallest type.
if (is.infinite(lower)) {
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7), other=attr$other))
}

if (lower < 0L) {
if (lower > -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7), other=attr$other))
} else if (lower > -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15), other=attr$other))
}
} else {
if (upper < 2^8 - 1) {
return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1), other=attr$other))
} else if (upper < 2^16 - 1) {
return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1), other=attr$other))
}
}

return(list(type="H5T_NATIVE_INT32", placeholder=NA_integer_, other=attr$other))

} else {
# If it's infinite, that means that 'x' is of length zero, otherwise
# there should have been at least one finite value available. Here,
# the type doesn't matter, so we'll just use the smallest.
if (is.infinite(lower)) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL, other=attr$other))
}

if (lower < 0L) {
if (lower >= -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL, other=attr$other))
} else if (lower >= -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=NULL, other=attr$other))
}
} else {
if (upper < 2^8) {
return(list(type="H5T_NATIVE_UINT8", placeholder=NULL, other=attr$other))
} else if (upper < 2^16) {
return(list(type="H5T_NATIVE_UINT16", placeholder=NULL, other=attr$other))
}
}

return(list(type="H5T_NATIVE_INT32", placeholder=NULL, other=attr$other))
}
}

#' @export
#' @rdname optimize_storage
setGeneric("collect_number_attributes", function(x) standardGeneric("collect_number_attributes"))

#' @export
setMethod("collect_number_attributes", "double", collect_numeric_attributes)

#' @export
setMethod("collect_number_attributes", "array", collect_numeric_attributes)

#' @export
#' @rdname optimize_storage
optimize_number_storage <- function(x, fallback = chooseMissingPlaceholderForHdf5) {
attr <- collect_number_attributes(x)
lower <- attr$min
upper <- attr$max

if (attr$missing) {
if (!attr$non_integer) {
if (lower < 0L) {
if (lower > -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=-2^7, other=attr$other))
} else if (lower > -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=-2^15, other=attr$other))
} else if (lower > -2^31 && upper < 2^31) {
return(list(type="H5T_NATIVE_INT32", placeholder=-2^31, other=attr$other))
}
} else {
if (upper < 2^8-1) {
return(list(type="H5T_NATIVE_UINT8", placeholder=2^8-1, other=attr$other))
} else if (upper < 2^16-1) {
return(list(type="H5T_NATIVE_UINT16", placeholder=2^16-1, other=attr$other))
} else if (upper < 2^32-1) {
return(list(type="H5T_NATIVE_UINT32", placeholder=2^32-1, other=attr$other))
}
}
}

placeholder <- NULL
if (!attr$has_NaN) {
placeholder <- NaN
} else if (!attr$has_Inf) {
placeholder <- Inf
} else if (!attr$has_NegInf) {
placeholder <- -Inf
} else if (!attr$has_lowest) {
placeholder <- lowest_double()
} else if (!attr$has_highest) {
placeholder <- highest_double()
}

if (is.null(placeholder)) {
if (is.null(fallback)) {
fallback <- chooseMissingPlaceholderForHdf5
}
placeholder <- fallback(x)
}

return(list(type="H5T_NATIVE_DOUBLE", placeholder=placeholder, other=attr$other))

} else {
if (!attr$non_integer) {
if (lower < 0L) {
if (lower >= -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL, other=attr$other))
} else if (lower >= -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=NULL, other=attr$other))
} else if (lower >= -2^31 && upper < 2^31) {
return(list(type="H5T_NATIVE_INT32", placeholder=NULL, other=attr$other))
}
} else {
if (upper < 2^8) {
return(list(type="H5T_NATIVE_UINT8", placeholder=NULL, other=attr$other))
} else if (upper < 2^16) {
return(list(type="H5T_NATIVE_UINT16", placeholder=NULL, other=attr$other))
} else if (upper < 2^32) {
return(list(type="H5T_NATIVE_UINT32", placeholder=NULL, other=attr$other))
}
}
}

return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL, other=attr$other))
}
}

#' @export
#' @rdname optimize_storage
setGeneric("collect_string_attributes", function(x) standardGeneric("collect_string_attributes"))

.collect_string_attributes_raw <- function(x) {
attr <- collect_character_attributes(x)

# Unfortunately, we need to throw an error, because if we need to change
# the encoding (e.g., with enc2utf8), the maximum length of each string in
# bytes may no longer be correct due to changes of the multi-byte
# characters. So, coercions should be done before 'x' enters this function.
if (attr$has_native) {
info <- l10n_info()
if (!info[["UTF-8"]]) {
stop("detected natively encoded strings in a non-UTF-8 locale")
}
} else if (attr$has_non_utf8) {
stop("detected non-UTF-8-encoded strings")
}

attr$has_native <- NULL
attr$has_non_utf8 <- NULL
attr
}

#' @export
setMethod("collect_string_attributes", "character", .collect_string_attributes_raw)

#' @export
setMethod("collect_string_attributes", "array", .collect_string_attributes_raw)

#' @export
#' @rdname optimize_storage
optimize_string_storage <- function(x, fallback = NULL) {
attr <- collect_string_attributes(x)

placeholder <- NULL
if (attr$missing) {
if (!attr[["has_NA"]]) {
placeholder <- "NA"
} else if (!attr[["has__NA"]]) {
placeholder <- "_NA"
} else {
if (is.null(fallback)) {
fallback <- chooseMissingPlaceholderForHdf5
}
placeholder <- fallback(x)
}
attr$max_len <- max(attr$max_len, nchar(placeholder, "bytes"))
}

tid <- H5Tcopy("H5T_C_S1")
H5Tset_strpad(tid, strpad = "NULLPAD")
H5Tset_size(tid, max(1L, attr$max_len))
H5Tset_cset(tid, "UTF8")

list(type=tid, placeholder=placeholder, other=attr$other)
}

#' @export
#' @rdname optimize_storage
setGeneric("collect_boolean_attributes", function(x) standardGeneric("collect_boolean_attributes"))

.collect_boolean_attributes_raw <- function(x) list(missing=anyNA(x))

#' @export
setMethod("collect_boolean_attributes", "logical", .collect_boolean_attributes_raw)

#' @export
setMethod("collect_boolean_attributes", "array", .collect_boolean_attributes_raw)

#' @export
#' @rdname optimize_storage
optimize_boolean_storage <- function(x) {
attr <- collect_boolean_attributes(x)
placeholder <- NULL
if (attr$missing) {
placeholder <- -1L
}
list(type="H5T_NATIVE_INT8", placeholder=placeholder, other=attr$other)
}
Loading

0 comments on commit 1a37b0c

Please sign in to comment.