Skip to content

Commit

Permalink
Switched the array savers/loaders to the simplified approach. (#5)
Browse files Browse the repository at this point in the history
This eliminates the dependence on schemas and aligns with the C++
validators implemented in takane (and compiled in alabaster.base).
  • Loading branch information
LTLA committed Nov 27, 2023
1 parent e35dbda commit 5ca5c17
Show file tree
Hide file tree
Showing 10 changed files with 330 additions and 192 deletions.
7 changes: 6 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(extractComponents)
export(loadArray)
export(loadWrapperArray)
export(preserveDelayedOperations)
export(readArray)
export(recycleHdf5Files)
export(writeSparseMatrix)
exportClasses(AmalgamatedArray)
Expand All @@ -27,8 +28,11 @@ exportMethods(extract_sparse_array)
exportMethods(is_sparse)
exportMethods(matrixClass)
exportMethods(path)
exportMethods(saveObject)
exportMethods(stageObject)
import(alabaster.base)
import(methods)
import(rhdf5)
importClassesFrom(DelayedArray,DelayedAbind)
importClassesFrom(DelayedArray,DelayedArray)
importClassesFrom(DelayedArray,DelayedMatrix)
Expand All @@ -39,6 +43,7 @@ importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dsparseMatrix)
importClassesFrom(SparseArray,SVT_SparseMatrix)
importFrom(BiocGenerics,path)
importFrom(DelayedArray,"type<-")
importFrom(DelayedArray,DelayedArray)
importFrom(DelayedArray,OLD_extract_sparse_array)
importFrom(DelayedArray,acbind)
Expand All @@ -59,6 +64,7 @@ importFrom(DelayedArray,read_sparse_block)
importFrom(DelayedArray,rowAutoGrid)
importFrom(DelayedArray,type)
importFrom(HDF5Array,H5SparseMatrixSeed)
importFrom(HDF5Array,HDF5Array)
importFrom(HDF5Array,HDF5ArraySeed)
importFrom(HDF5Array,writeHDF5Array)
importFrom(S4Vectors,new2)
Expand All @@ -70,7 +76,6 @@ importFrom(alabaster.base,acquireFile)
importFrom(alabaster.base,acquireMetadata)
importFrom(alabaster.base,addMissingPlaceholderAttributeForHdf5)
importFrom(alabaster.base,loadObject)
importFrom(alabaster.base,stageObject)
importFrom(alabaster.base,transformVectorForHdf5)
importFrom(rhdf5,H5Fclose)
importFrom(rhdf5,H5Fopen)
Expand Down
34 changes: 0 additions & 34 deletions R/loadArray.R

This file was deleted.

85 changes: 85 additions & 0 deletions R/readArray.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#' Read a dense array from disk
#'
#' Read a dense high-dimensional array from its on-disk representation.
#'
#' @param path String containing a path to a directory, itself created by the \code{\link{saveObject}} method for a dense array.
#' @param array.file.backed Logical scalar indicating whether to return a file-backed S4 array class, or an ordinary R array in memory.
#' @param ... Further arguments, ignored.
#'
#' @return A multi-dimensional array-like object, either a \linkS4class{DelayedArray} or an ordinary R array.
#'
#' @seealso
#' \code{"\link{saveObject,array-method}"}, to create the directory and its contents.
#'
#' @author Aaron Lun
#'
#' @examples
#' arr <- array(rpois(10000, 10), c(50, 20, 10))
#' dimnames(arr) <- list(
#' paste0("GENE_", seq_len(nrow(arr))),
#' letters[1:20],
#' NULL
#' )
#'
#' dir <- tempfile()
#' saveObject(arr, dir)
#' readArray(dir)
#'
#' @export
#' @importFrom HDF5Array HDF5Array
#' @importFrom DelayedArray type<-
readArray <- function(path, array.file.backed=TRUE, ...) {
fpath <- file.path(path, "array.h5")

details <- local({
fhandle <- H5Fopen(fpath)
on.exit(H5Fclose(fhandle), add=TRUE, after=FALSE)
ghandle <- H5Gopen(fhandle, "dense_array")
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)

type <- h5_read_attribute(ghandle, "type")
transposed <- h5_read_attribute(ghandle, "transposed", check=TRUE, default=0L)

dhandle <- H5Dopen(ghandle, "data")
on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE)
placeholder <- h5_read_attribute(dhandle, "missing-value-placeholder", check=TRUE, default=NULL)

ndims <- length(H5Sget_simple_extent_dims(H5Dget_space(dhandle)))
names <- load_names(ghandle, ndims)

list(type=type, names=names, transposed=(transposed != 0L), placeholder=placeholder)
})

out <- HDF5Array(filepath=fpath, name="dense_array/data")
if (type(out) == "raw") { # ... so that placeholders are correctly substituted.
type(out) <- "integer"
}

if (!is.null(details$names)) {
dimnames(out) <- rev(details$names)
}
if (!details$transposed) { # All R-based HDF5 bindings will automatically transpose, so we need to untranspose if `transposed=FALSE`.
out <- t(out)
}
if (!is.null(details$placeholder)) {
out <- DelayedMask(out, placeholder=details$placeholder)
out <- DelayedArray(out)
}
type(out) <- from_array_type(details$type)

if (!array.file.backed) {
return(as.array(out))
} else {
return(DelayedArray(out))
}
}

##############################
######### OLD STUFF ##########
##############################

#' @export
loadArray <- function(info, project) {
seed <- .createRawArraySeed(info, project=project, names=TRUE)
DelayedArray(seed)
}
105 changes: 66 additions & 39 deletions R/stageArray.R → R/saveArray.R
Original file line number Diff line number Diff line change
@@ -1,58 +1,89 @@
#' Stage a multi-dimensional array for upload
#' Save a multi-dimensional array to disk
#'
#' Stage a high-dimensional array in preparation for upload to DataSetDB.
#' Save a high-dimensional array to its on-disk representations.
#'
#' @param x An array, almost always integer or numeric, though logical and character matrices are also supported.
#' Alternatively, a \linkS4class{DelayedArray} or any instance of a \linkS4class{Matrix} class.
#' @param dir String containing the path to the staging directory.
#' @param path String containing the relative path to a subdirectory inside the staging directory, in which \code{x} is to be saved.
#' @param child Logical scalar indicating whether \code{x} is a child of a larger object.
#' @param x An integer, numeric, logical or character array.
#' Alternatively, any of the \linkS4class{denseMatrix} subclasses from the \pkg{Matrix} package.
#' @param path String containing the path to a directory in which to save \code{x}.
#' @param ... Further arguments, currently ignored.
#'
#' @return
#' \code{x} is saved into a single file at \code{file.path(dir, path)}, possibly after appending an arbitrary file extension.
#' A named list is returned, containing at least:
#' \itemize{
#' \item \code{$schema}, a string specifying the schema to use to validate the metadata.
#' \item \code{path}, a string containing the path to the file inside the subdirectory, containing the assay contents.
#' \item \code{is_child}, a logical scalar equal to the input \code{child}.
#' }
#'
#' @details
#' For dense arrays, we save the array as a dense matrix in a HDF5 file using methods from the \pkg{HDF5Array} package.
#' For sparse matrices, we call \code{\link{writeSparseMatrix}} to save the data in the 10X sparse matrix format.
#' Other representations may have more appropriate formats, which are supported by simply writing new methods for this generic.
#' Note that specialized methods will usually require new schemas to validate any new metadata fields.
#'
#' If \code{x} itself is a child of a larger object, we suggest using the output \code{path} when referencing \code{x} from within the larger object's metadata.
#' This is because \code{stageObject} methods may add more path components, file extensions, etc. to the input \code{path} when saving the object.
#' As a result, the output \code{path} may not be the same as the input \code{path}.
#' \code{x} is saved to \code{path} and \code{NULL} is invisibly returned.
#'
#' @seealso
#' \code{\link{preserveDelayedOperations}}, to preserve the delayed'ness of a \linkS4class{DelayedMatrix} \code{x}.
#'
#' \code{\link{recycleHdf5Files}}, to re-use the existing file in a HDF5-backed \linkS4class{DelayedMatrix} \code{x}.
#' \code{\link{readArray}}, to read the directory contents back into the R session.
#'
#' @author Aaron Lun
#' @examples
#' dir <- tempfile()
#' dir.create(dir)
#'
#' mat <- array(rpois(10000, 10), c(50, 20, 10))
#' dimnames(mat) <- list(
#' paste0("GENE_", seq_len(nrow(mat))),
#' letters[1:20],
#' NULL
#' )
#'
#' path <- "whee"
#' stageObject(mat, dir, path)
#'
#' dir <- tempfile()
#' saveObject(mat, dir)
#' list.files(dir)
#'
#' @name stageArray
#' @importFrom alabaster.base stageObject
#' @name saveArray
#' @aliases
#' stageObject,array-method
#' stageObject,Matrix-method
NULL

#' @import alabaster.base rhdf5
.save_array <- function(x, path, ...) {
dir.create(path)
fpath <- file.path(path, "array.h5")
name <- "dense_array"

# This needs to be wrapped up as writeHDF5Array needs to
# take ownership of the file handle internally.
local({
fhandle <- H5Fcreate(fpath, "H5F_ACC_TRUNC")
on.exit(H5Fclose(fhandle), add=TRUE, after=FALSE)

ghandle <- H5Gcreate(fhandle, name)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)

h5_write_attribute(ghandle, "version", "1.0", scalar=TRUE)
h5_write_attribute(ghandle, "type", array_type(x), scalar=TRUE)
h5_write_attribute(ghandle, "transposed", 1L, scalar=TRUE)
})

transformed <- transformVectorForHdf5(x)
writeHDF5Array(transformed$transformed, filepath=fpath, name="dense_array/data")

# Reinitializing the handle for more downstream work.
fhandle <- H5Fopen(fpath, "H5F_ACC_RDWR")
on.exit(H5Fclose(fhandle), add=TRUE, after=FALSE)
ghandle <- H5Gopen(fhandle, name)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)

if (!is.null(transformed$placeholder)) {
dhandle <- H5Dopen(ghandle, "data")
on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE)
h5_write_attribute(dhandle, missingPlaceholderName, transformed$placeholder, scalar=TRUE)
}

save_names(ghandle, x, transpose=TRUE)
write(name, file=file.path(path, "OBJECT"))
invisible(NULL)
}

#' @export
#' @rdname saveArray
setMethod("saveObject", "array", .save_array)

#' @export
#' @rdname saveArray
setMethod("saveObject", "denseMatrix", .save_array)

##############################
######### OLD STUFF ##########
##############################

#' @importFrom DelayedArray is_sparse
#' @importFrom rhdf5 h5createFile
#' @importFrom HDF5Array writeHDF5Array
Expand Down Expand Up @@ -111,7 +142,6 @@ NULL
}

#' @export
#' @rdname stageArray
setMethod("stageObject", "array", function(x, dir, path, child=FALSE) .stage_array(x, dir, path, child=child))

#' @importFrom alabaster.base .stageObject
Expand Down Expand Up @@ -141,7 +171,6 @@ setMethod("stageObject", "array", function(x, dir, path, child=FALSE) .stage_arr
}

#' @export
#' @rdname stageArray
setMethod("stageObject", "DelayedArray", function(x, dir, path, child=FALSE) .stage_delayed(x, dir, path, child = child, fallback = .stage_array))

#' @importFrom rhdf5 h5createFile h5createGroup
Expand Down Expand Up @@ -179,12 +208,10 @@ setMethod("stageObject", "DelayedArray", function(x, dir, path, child=FALSE) .st
}

#' @export
#' @rdname stageArray
#' @importClassesFrom Matrix Matrix
setMethod("stageObject", "Matrix", function(x, dir, path, child=FALSE) .stage_any_matrix(x, dir, path, child=child))

#' @export
#' @rdname stageArray
setMethod("stageObject", "DelayedMatrix", function(x, dir, path, child=FALSE) .stage_delayed(x, dir, path, child = child, fallback = .stage_any_matrix))

.link_or_copy <- function(from, to) {
Expand Down
50 changes: 49 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#' @importFrom DelayedArray type
array_type <- function(x) {
to_array_type <- function(x) {
switch(type(x),
integer="integer",
double="number",
Expand All @@ -8,3 +8,51 @@ array_type <- function(x) {
"other"
)
}

array_type <- to_array_type

from_array_type <- function(x) {
switch(x,
integer="integer",
number="double",
boolean="logical",
string="character"
)
}

save_names <- function(handle, x, group = "names", transpose=FALSE) {
d <- dimnames(x)
if (is.null(d) || all(vapply(d, is.null, TRUE))) {
return(NULL)
}

if (transpose) { # for the HDF5 array transposition.
d <- rev(d)
}

ghandle <- H5Gcreate(handle, group)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)
for (i in seq_along(d)) {
current <- d[[i]]
if (!is.null(current)) {
h5_write_vector(ghandle, as.character(i - 1L), current)
}
}
}

load_names <- function(handle, ndim, group = "names") {
if (!h5_object_exists(handle, group)) {
return(NULL)
}

ghandle <- H5Gopen(handle, group)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)
all.named <- h5ls(ghandle, datasetinfo=FALSE, recursive=FALSE)

names <- vector("list", ndim)
for (y in all.named$name) {
names[[as.integer(y) + 1L]] <- h5_read_vector(ghandle, y)
}

names
}
Loading

0 comments on commit 5ca5c17

Please sign in to comment.