Support staging and loading of atomic vectors.

ArtifactDB · Sep 30, 2023 · b107fff · b107fff
1 parent 8d50faf
commit b107fff
Show file tree

Hide file tree

Showing 10 changed files with 362 additions and 32 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -30,6 +30,8 @@ export(createRedirection)
 export(customloadObjectHelper)
 export(listDirectory)
 export(listLocalObjects)
+export(loadAtomicVector)
+export(loadBaseFactor)
 export(loadBaseList)
 export(loadDataFrame)
 export(loadDataFrameFactor)

diff --git a/R/loadAtomicVector.R b/R/loadAtomicVector.R
@@ -0,0 +1,50 @@
+#' Load an atomic vector
+#'
+#' Load a simple vector consisting of atomic elements from file.
+#'
+#' @inheritParams loadDataFrame
+#' @param ... Further arguments, ignored.
+#'
+#' @return The vector described by \code{info}, possibly with names.
+#'
+#' @seealso
+#' \code{"\link{stageObject,integer-method}"}, for one of the staging methods.
+#'
+#' @author Aaron Lun
+#' 
+#' @examples
+#' tmp <- tempfile()
+#' dir.create(tmp)
+#' meta <- stageObject(setNames(runif(26), letters), tmp, path="bar")
+#' loadAtomicVector(meta, tmp)
+#' 
+#' @export
+loadAtomicVector <- function(info, project, ...) {
+    fpath <- acquireFile(project, info$path)
+    meta <- info$atomic_vector
+
+    df <- read.csv3(fpath, compression=meta$compression, nrows=meta$length)
+    output <- df[,ncol(df)]
+    type <- meta$type
+
+    if (type == "string") {
+        format <- meta$format
+        if (!is.null(format)) {
+            if (format == "date") {
+                output <- as.Date(output)
+            } else if (format == "date-time") {
+                output <- .cast_datetime(output)
+            } else {
+                output <- as.character(output)
+            }
+        }
+    } else {
+        stopifnot(.is_atomic(type))
+        output <- .cast_atomic(output, type)
+    }
+
+    if (isTRUE(meta$names)) {
+        names(output) <- df[,1]
+    }
+    output 
+}
diff --git a/R/loadDataFrame.R b/R/loadDataFrame.R
@@ -98,7 +98,6 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
 
     # Make sure everyone is of the right type.
     new.names <- character(ncol(df))
-    atomics <- c(integer="integer", number="double", string="character", boolean="logical")
 
     for (i in seq_along(col.info)) {
         new.names[i] <- col.info[[i]]$name
@@ -114,10 +113,10 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
 
         } else if (col.type=="date-time") {
             # Remove colon in the timezone, which confuses as.POSIXct().
-            df[[i]] <- as.POSIXct(sub(":([0-9]{2})$", "\\1", df[[i]]), format="%Y-%m-%dT%H:%M:%S%z")
+            df[[i]] <- .cast_datetime(df[[i]])
 
-        } else if (col.type %in% names(atomics)) {
-            df[[i]] <- as(df[[i]], as.character(atomics[col.type]))
+        } else if (.is_atomic(col.type)) {
+            df[[i]] <- .cast_atomic(df[[i]], col.type)
 
         } else if (col.type == "other") {
             current <- acquireMetadata(project, col.info[[i]]$resource$path)

diff --git a/R/stageAtomicVector.R b/R/stageAtomicVector.R
@@ -0,0 +1,103 @@
+#' @importFrom S4Vectors DataFrame
+.stage_atomic_vector <- function(x, dir, path, child=FALSE, ...) {
+    dir.create(file.path(dir, path), showWarnings=FALSE)
+    new_path <- paste0(path, "/simple.txt.gz")
+
+    if (.is_datetime(x)) {
+        type <- "string"
+        format <- "date-time"
+        contents <- .sanitize_datetime(x)
+
+    } else if (is(x, "Date")) {
+        type <- "string"
+        format <- "date"
+        contents <- .sanitize_date(x)
+
+    } else {
+        stopifnot(is.atomic(x))
+        remapped <- .remap_atomic_type(x)
+        type <- remapped$type
+        contents <- remapped$values
+        format <- NULL
+    }
+
+    mock <- DataFrame(values=contents)
+    if (!is.null(names(x))) {
+        mock <- cbind(names=names(x), mock)
+    }
+    quickWriteCsv(mock, file.path(dir, new_path), row.names=FALSE, compression="gzip")
+
+    list(
+        `$schema` = "atomic_vector/v1.json",
+        path = new_path,
+        is_child = child,
+        atomic_vector = list(
+            type = type,
+            length = length(contents),
+            names = !is.null(names(x)),
+            format = format,
+            compression="gzip"
+        )
+    )
+}
+
+#' Stage atomic vectors
+#'
+#' Stage vectors containing atomic elements (or values that can be cast as such, e.g., dates and times).
+#'
+#' @param x Any of the atomic vector types, or \link{Date} objects, or time objects, e.g., \link{POSIXct}.
+#' @inheritParams stageObject
+#' @param ... Further arguments that are ignored.
+#' 
+#' @return
+#' A named list containing the metadata for \code{x}.
+#' \code{x} itself is written to a CSV file inside \code{path}.
+#'
+#' @details 
+#' Dates and POSIX times are cast to strings;
+#' the type itself is recorded in the metadata.
+#'
+#' @author Aaron Lun
+#' 
+#' @examples
+#' tmp <- tempfile()
+#' dir.create(tmp)
+#' stageObject(LETTERS, tmp, path="foo")
+#' stageObject(setNames(runif(26), letters), tmp, path="bar")
+#'
+#' list.files(tmp, recursive=TRUE)
+#' 
+#' @name stageAtomicVector
+NULL
+
+#' @export
+#' @rdname stageAtomicVector
+setMethod("stageObject", "integer", .stage_atomic_vector)
+
+#' @export
+#' @rdname stageAtomicVector
+setMethod("stageObject", "character", .stage_atomic_vector)
+
+#' @export
+#' @rdname stageAtomicVector
+setMethod("stageObject", "logical", .stage_atomic_vector)
+
+#' @export
+#' @rdname stageAtomicVector
+setMethod("stageObject", "double", .stage_atomic_vector)
+
+#' @export
+#' @rdname stageAtomicVector
+setMethod("stageObject", "numeric", .stage_atomic_vector)
+
+#' @export
+#' @rdname stageAtomicVector
+setMethod("stageObject", "Date", .stage_atomic_vector)
+
+#' @export
+#' @rdname stageAtomicVector
+setMethod("stageObject", "POSIXlt", .stage_atomic_vector)
+
+#' @export
+#' @rdname stageAtomicVector
+setMethod("stageObject", "POSIXct", .stage_atomic_vector)
diff --git a/R/stageBaseList.R b/R/stageBaseList.R
@@ -151,7 +151,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
             return(NULL)
 
         } else if (is.atomic(x)) {
-            coerced <- .remap_type(x)
+            coerced <- .remap_atomic_type(x)
 
             .label_hdf5_group(fpath, name, 
                 uzuki_object="vector",
@@ -191,7 +191,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
 .is_stringlike <- function(x) {
     if (is(x, "Date")) {
         return("date")
-    } else if (is(x, "POSIXct") || is(x, "POSIXlt")) {
+    } else if (.is_datetime(x)) {
         return("date-time")
     } else if (is.character(x)) {
         return("string")
@@ -283,7 +283,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
             return(formatted)
 
         } else if (is.atomic(x)) {
-            formatted <- .remap_type(x)
+            formatted <- .remap_atomic_type(x)
 
             y <- formatted$values
             if (is.numeric(y) && !all(is.finite(y))) {

diff --git a/R/stageDataFrame.R b/R/stageDataFrame.R
@@ -112,19 +112,16 @@ setMethod("stageObject", "DataFrame", function(x, dir, path, child=FALSE, df.nam
 
                 x[[z]] <- as.character(col)
 
-            } else if (is.character(col)) {
-                out$type <- "string"
-
-            } else if (is(col, "POSIXct") || is(col, "POSIXlt")) {
+            } else if (.is_datetime(col)) {
                 out$type <- "date-time"
                 x[[z]] <- .sanitize_datetime(col)
 
             } else if (is(col, "Date")) {
                 out$type <- "date"
-                x[[z]] <- format(col, "%Y-%m-%d")
+                x[[z]] <- .sanitize_date(col)
 
             } else if (is.atomic(col)) {
-                coerced <- .remap_type(col)
+                coerced <- .remap_atomic_type(col)
                 out$type <- coerced$type
                 x[[z]] <- coerced$values
 
@@ -208,25 +205,6 @@ setMethod("stageObject", "DataFrame", function(x, dir, path, child=FALSE, df.nam
     meta
 })
 
-.sanitize_datetime <- function(x) {
-    sub("([0-9]{2})$", ":\\1", strftime(x, "%Y-%m-%dT%H:%M:%S%z"))
-}
-
-.remap_type <- function(x) {
-    y <- typeof(x)
-
-    # Forcibly coercing the types, just to make sure that
-    # we don't get tricked by classes that might do something
-    # different inside write.csv or whatever.
-    switch(y,
-        integer=list(type="integer", values=as.integer(x)),
-        double=list(type="number", values=as.double(x)),
-        numeric=list(type="number", values=as.double(x)),
-        logical=list(type="boolean", values=as.logical(x)),
-        stop("type '", y, "' is not supported")
-    )
-}
-
 #' @importFrom rhdf5 h5write h5createGroup h5createFile
 .write_hdf5_data_frame <- function(x, skippable, host, ofile) {
     h5createFile(ofile)

diff --git a/R/utils.R b/R/utils.R
@@ -0,0 +1,42 @@
+.sanitize_date <- function(x) {
+    format(x, "%Y-%m-%d")
+}
+
+.sanitize_datetime <- function(x) {
+    sub("([0-9]{2})$", ":\\1", strftime(x, "%Y-%m-%dT%H:%M:%S%z"))
+}
+
+.is_datetime <- function(x) {
+    is(x, "POSIXct") || is(x, "POSIXlt")
+}
+
+.remap_atomic_type <- function(x) {
+    y <- typeof(x)
+
+    # Forcibly coercing the types, just to make sure that
+    # we don't get tricked by classes that might do something
+    # different inside write.csv or whatever.
+    switch(y,
+        integer=list(type="integer", values=as.integer(x)),
+        double=list(type="number", values=as.double(x)),
+        numeric=list(type="number", values=as.double(x)),
+        logical=list(type="boolean", values=as.logical(x)),
+        character=list(type="string", values=as.character(x)),
+        stop("type '", y, "' is not supported")
+    )
+}
+
+.cast_datetime <- function(x) {
+    # Remove colon in the timezone, which confuses as.POSIXct().
+    as.POSIXct(sub(":([0-9]{2})$", "\\1", x), format="%Y-%m-%dT%H:%M:%S%z")
+}
+
+.atomics <- c(integer="integer", number="double", string="character", boolean="logical")
+
+.is_atomic <- function(type) {
+    type %in% names(.atomics)
+}
+
+.cast_atomic <- function(x, type) {
+    as(x, as.character(.atomics[type]))
+}
diff --git a/man/loadAtomicVector.Rd b/man/loadAtomicVector.Rd