From b107ffffc58d403f2ab4aac3031f1c148577d98c Mon Sep 17 00:00:00 2001 From: LTLA Date: Sat, 30 Sep 2023 13:11:21 -0700 Subject: [PATCH] Support staging and loading of atomic vectors. --- NAMESPACE | 2 + R/loadAtomicVector.R | 50 +++++++++++++++++ R/loadDataFrame.R | 7 +-- R/stageAtomicVector.R | 103 +++++++++++++++++++++++++++++++++++ R/stageBaseList.R | 6 +- R/stageDataFrame.R | 28 +--------- R/utils.R | 42 ++++++++++++++ man/loadAtomicVector.Rd | 35 ++++++++++++ man/stageAtomicVector.Rd | 65 ++++++++++++++++++++++ tests/testthat/test-vector.R | 56 +++++++++++++++++++ 10 files changed, 362 insertions(+), 32 deletions(-) create mode 100644 R/loadAtomicVector.R create mode 100644 R/stageAtomicVector.R create mode 100644 R/utils.R create mode 100644 man/loadAtomicVector.Rd create mode 100644 man/stageAtomicVector.Rd create mode 100644 tests/testthat/test-vector.R diff --git a/NAMESPACE b/NAMESPACE index 7f1de3e..0dbbb1b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,8 @@ export(createRedirection) export(customloadObjectHelper) export(listDirectory) export(listLocalObjects) +export(loadAtomicVector) +export(loadBaseFactor) export(loadBaseList) export(loadDataFrame) export(loadDataFrameFactor) diff --git a/R/loadAtomicVector.R b/R/loadAtomicVector.R new file mode 100644 index 0000000..b253885 --- /dev/null +++ b/R/loadAtomicVector.R @@ -0,0 +1,50 @@ +#' Load an atomic vector +#' +#' Load a simple vector consisting of atomic elements from file. +#' +#' @inheritParams loadDataFrame +#' @param ... Further arguments, ignored. +#' +#' @return The vector described by \code{info}, possibly with names. +#' +#' @seealso +#' \code{"\link{stageObject,integer-method}"}, for one of the staging methods. +#' +#' @author Aaron Lun +#' +#' @examples +#' tmp <- tempfile() +#' dir.create(tmp) +#' meta <- stageObject(setNames(runif(26), letters), tmp, path="bar") +#' loadAtomicVector(meta, tmp) +#' +#' @export +loadAtomicVector <- function(info, project, ...) { + fpath <- acquireFile(project, info$path) + meta <- info$atomic_vector + + df <- read.csv3(fpath, compression=meta$compression, nrows=meta$length) + output <- df[,ncol(df)] + type <- meta$type + + if (type == "string") { + format <- meta$format + if (!is.null(format)) { + if (format == "date") { + output <- as.Date(output) + } else if (format == "date-time") { + output <- .cast_datetime(output) + } else { + output <- as.character(output) + } + } + } else { + stopifnot(.is_atomic(type)) + output <- .cast_atomic(output, type) + } + + if (isTRUE(meta$names)) { + names(output) <- df[,1] + } + output +} diff --git a/R/loadDataFrame.R b/R/loadDataFrame.R index 8625d49..8a69d79 100644 --- a/R/loadDataFrame.R +++ b/R/loadDataFrame.R @@ -98,7 +98,6 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) { # Make sure everyone is of the right type. new.names <- character(ncol(df)) - atomics <- c(integer="integer", number="double", string="character", boolean="logical") for (i in seq_along(col.info)) { new.names[i] <- col.info[[i]]$name @@ -114,10 +113,10 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) { } else if (col.type=="date-time") { # Remove colon in the timezone, which confuses as.POSIXct(). - df[[i]] <- as.POSIXct(sub(":([0-9]{2})$", "\\1", df[[i]]), format="%Y-%m-%dT%H:%M:%S%z") + df[[i]] <- .cast_datetime(df[[i]]) - } else if (col.type %in% names(atomics)) { - df[[i]] <- as(df[[i]], as.character(atomics[col.type])) + } else if (.is_atomic(col.type)) { + df[[i]] <- .cast_atomic(df[[i]], col.type) } else if (col.type == "other") { current <- acquireMetadata(project, col.info[[i]]$resource$path) diff --git a/R/stageAtomicVector.R b/R/stageAtomicVector.R new file mode 100644 index 0000000..d096952 --- /dev/null +++ b/R/stageAtomicVector.R @@ -0,0 +1,103 @@ +#' @importFrom S4Vectors DataFrame +.stage_atomic_vector <- function(x, dir, path, child=FALSE, ...) { + dir.create(file.path(dir, path), showWarnings=FALSE) + new_path <- paste0(path, "/simple.txt.gz") + + if (.is_datetime(x)) { + type <- "string" + format <- "date-time" + contents <- .sanitize_datetime(x) + + } else if (is(x, "Date")) { + type <- "string" + format <- "date" + contents <- .sanitize_date(x) + + } else { + stopifnot(is.atomic(x)) + remapped <- .remap_atomic_type(x) + type <- remapped$type + contents <- remapped$values + format <- NULL + } + + mock <- DataFrame(values=contents) + if (!is.null(names(x))) { + mock <- cbind(names=names(x), mock) + } + quickWriteCsv(mock, file.path(dir, new_path), row.names=FALSE, compression="gzip") + + list( + `$schema` = "atomic_vector/v1.json", + path = new_path, + is_child = child, + atomic_vector = list( + type = type, + length = length(contents), + names = !is.null(names(x)), + format = format, + compression="gzip" + ) + ) +} + +#' Stage atomic vectors +#' +#' Stage vectors containing atomic elements (or values that can be cast as such, e.g., dates and times). +#' +#' @param x Any of the atomic vector types, or \link{Date} objects, or time objects, e.g., \link{POSIXct}. +#' @inheritParams stageObject +#' @param ... Further arguments that are ignored. +#' +#' @return +#' A named list containing the metadata for \code{x}. +#' \code{x} itself is written to a CSV file inside \code{path}. +#' +#' @details +#' Dates and POSIX times are cast to strings; +#' the type itself is recorded in the metadata. +#' +#' @author Aaron Lun +#' +#' @examples +#' tmp <- tempfile() +#' dir.create(tmp) +#' stageObject(LETTERS, tmp, path="foo") +#' stageObject(setNames(runif(26), letters), tmp, path="bar") +#' +#' list.files(tmp, recursive=TRUE) +#' +#' @name stageAtomicVector +NULL + +#' @export +#' @rdname stageAtomicVector +setMethod("stageObject", "integer", .stage_atomic_vector) + +#' @export +#' @rdname stageAtomicVector +setMethod("stageObject", "character", .stage_atomic_vector) + +#' @export +#' @rdname stageAtomicVector +setMethod("stageObject", "logical", .stage_atomic_vector) + +#' @export +#' @rdname stageAtomicVector +setMethod("stageObject", "double", .stage_atomic_vector) + +#' @export +#' @rdname stageAtomicVector +setMethod("stageObject", "numeric", .stage_atomic_vector) + +#' @export +#' @rdname stageAtomicVector +setMethod("stageObject", "Date", .stage_atomic_vector) + +#' @export +#' @rdname stageAtomicVector +setMethod("stageObject", "POSIXlt", .stage_atomic_vector) + +#' @export +#' @rdname stageAtomicVector +setMethod("stageObject", "POSIXct", .stage_atomic_vector) diff --git a/R/stageBaseList.R b/R/stageBaseList.R index 576f215..043a202 100644 --- a/R/stageBaseList.R +++ b/R/stageBaseList.R @@ -151,7 +151,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list return(NULL) } else if (is.atomic(x)) { - coerced <- .remap_type(x) + coerced <- .remap_atomic_type(x) .label_hdf5_group(fpath, name, uzuki_object="vector", @@ -191,7 +191,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list .is_stringlike <- function(x) { if (is(x, "Date")) { return("date") - } else if (is(x, "POSIXct") || is(x, "POSIXlt")) { + } else if (.is_datetime(x)) { return("date-time") } else if (is.character(x)) { return("string") @@ -283,7 +283,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list return(formatted) } else if (is.atomic(x)) { - formatted <- .remap_type(x) + formatted <- .remap_atomic_type(x) y <- formatted$values if (is.numeric(y) && !all(is.finite(y))) { diff --git a/R/stageDataFrame.R b/R/stageDataFrame.R index 6958958..5f5518a 100644 --- a/R/stageDataFrame.R +++ b/R/stageDataFrame.R @@ -112,19 +112,16 @@ setMethod("stageObject", "DataFrame", function(x, dir, path, child=FALSE, df.nam x[[z]] <- as.character(col) - } else if (is.character(col)) { - out$type <- "string" - - } else if (is(col, "POSIXct") || is(col, "POSIXlt")) { + } else if (.is_datetime(col)) { out$type <- "date-time" x[[z]] <- .sanitize_datetime(col) } else if (is(col, "Date")) { out$type <- "date" - x[[z]] <- format(col, "%Y-%m-%d") + x[[z]] <- .sanitize_date(col) } else if (is.atomic(col)) { - coerced <- .remap_type(col) + coerced <- .remap_atomic_type(col) out$type <- coerced$type x[[z]] <- coerced$values @@ -208,25 +205,6 @@ setMethod("stageObject", "DataFrame", function(x, dir, path, child=FALSE, df.nam meta }) -.sanitize_datetime <- function(x) { - sub("([0-9]{2})$", ":\\1", strftime(x, "%Y-%m-%dT%H:%M:%S%z")) -} - -.remap_type <- function(x) { - y <- typeof(x) - - # Forcibly coercing the types, just to make sure that - # we don't get tricked by classes that might do something - # different inside write.csv or whatever. - switch(y, - integer=list(type="integer", values=as.integer(x)), - double=list(type="number", values=as.double(x)), - numeric=list(type="number", values=as.double(x)), - logical=list(type="boolean", values=as.logical(x)), - stop("type '", y, "' is not supported") - ) -} - #' @importFrom rhdf5 h5write h5createGroup h5createFile .write_hdf5_data_frame <- function(x, skippable, host, ofile) { h5createFile(ofile) diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..9e8f679 --- /dev/null +++ b/R/utils.R @@ -0,0 +1,42 @@ +.sanitize_date <- function(x) { + format(x, "%Y-%m-%d") +} + +.sanitize_datetime <- function(x) { + sub("([0-9]{2})$", ":\\1", strftime(x, "%Y-%m-%dT%H:%M:%S%z")) +} + +.is_datetime <- function(x) { + is(x, "POSIXct") || is(x, "POSIXlt") +} + +.remap_atomic_type <- function(x) { + y <- typeof(x) + + # Forcibly coercing the types, just to make sure that + # we don't get tricked by classes that might do something + # different inside write.csv or whatever. + switch(y, + integer=list(type="integer", values=as.integer(x)), + double=list(type="number", values=as.double(x)), + numeric=list(type="number", values=as.double(x)), + logical=list(type="boolean", values=as.logical(x)), + character=list(type="string", values=as.character(x)), + stop("type '", y, "' is not supported") + ) +} + +.cast_datetime <- function(x) { + # Remove colon in the timezone, which confuses as.POSIXct(). + as.POSIXct(sub(":([0-9]{2})$", "\\1", x), format="%Y-%m-%dT%H:%M:%S%z") +} + +.atomics <- c(integer="integer", number="double", string="character", boolean="logical") + +.is_atomic <- function(type) { + type %in% names(.atomics) +} + +.cast_atomic <- function(x, type) { + as(x, as.character(.atomics[type])) +} diff --git a/man/loadAtomicVector.Rd b/man/loadAtomicVector.Rd new file mode 100644 index 0000000..73c0dd9 --- /dev/null +++ b/man/loadAtomicVector.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/loadAtomicVector.R +\name{loadAtomicVector} +\alias{loadAtomicVector} +\title{Load an atomic vector} +\usage{ +loadAtomicVector(info, project, ...) +} +\arguments{ +\item{info}{Named list containing the metadata for this object.} + +\item{project}{Any argument accepted by the acquisition functions, see \code{?\link{acquireFile}}. +By default, this should be a string containing the path to a staging directory.} + +\item{...}{Further arguments, ignored.} +} +\value{ +The vector described by \code{info}, possibly with names. +} +\description{ +Load a simple vector consisting of atomic elements from file. +} +\examples{ +tmp <- tempfile() +dir.create(tmp) +meta <- stageObject(setNames(runif(26), letters), tmp, path="bar") +loadAtomicVector(meta, tmp) + +} +\seealso{ +\code{"\link{stageObject,integer-method}"}, for one of the staging methods. +} +\author{ +Aaron Lun +} diff --git a/man/stageAtomicVector.Rd b/man/stageAtomicVector.Rd new file mode 100644 index 0000000..1331f92 --- /dev/null +++ b/man/stageAtomicVector.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stageAtomicVector.R +\name{stageAtomicVector} +\alias{stageAtomicVector} +\alias{stageObject,integer-method} +\alias{stageObject,character-method} +\alias{stageObject,logical-method} +\alias{stageObject,double-method} +\alias{stageObject,numeric-method} +\alias{stageObject,Date-method} +\alias{stageObject,POSIXlt-method} +\alias{stageObject,POSIXct-method} +\title{Stage atomic vectors} +\usage{ +\S4method{stageObject}{integer}(x, dir, path, child = FALSE, ...) + +\S4method{stageObject}{character}(x, dir, path, child = FALSE, ...) + +\S4method{stageObject}{logical}(x, dir, path, child = FALSE, ...) + +\S4method{stageObject}{double}(x, dir, path, child = FALSE, ...) + +\S4method{stageObject}{numeric}(x, dir, path, child = FALSE, ...) + +\S4method{stageObject}{Date}(x, dir, path, child = FALSE, ...) + +\S4method{stageObject}{POSIXlt}(x, dir, path, child = FALSE, ...) + +\S4method{stageObject}{POSIXct}(x, dir, path, child = FALSE, ...) +} +\arguments{ +\item{x}{Any of the atomic vector types, or \link{Date} objects, or time objects, e.g., \link{POSIXct}.} + +\item{dir}{String containing the path to the staging directory.} + +\item{path}{String containing a prefix of the relative path inside \code{dir} where \code{x} is to be saved. +The actual path used to save \code{x} may include additional components, see Details.} + +\item{child}{Logical scalar indicating whether \code{x} is a child of a larger object.} + +\item{...}{Further arguments that are ignored.} +} +\value{ +A named list containing the metadata for \code{x}. +\code{x} itself is written to a CSV file inside \code{path}. +} +\description{ +Stage vectors containing atomic elements (or values that can be cast as such, e.g., dates and times). +} +\details{ +Dates and POSIX times are cast to strings; +the type itself is recorded in the metadata. +} +\examples{ +tmp <- tempfile() +dir.create(tmp) +stageObject(LETTERS, tmp, path="foo") +stageObject(setNames(runif(26), letters), tmp, path="bar") + +list.files(tmp, recursive=TRUE) + +} +\author{ +Aaron Lun +} diff --git a/tests/testthat/test-vector.R b/tests/testthat/test-vector.R new file mode 100644 index 0000000..ca2773a --- /dev/null +++ b/tests/testthat/test-vector.R @@ -0,0 +1,56 @@ +# Test stageObject on simple vectors. +# library(testthat); library(alabaster.base); source("test-vector.R") + +test_that("vectors work correctly without names", { + tmp <- tempfile() + dir.create(tmp, recursive=TRUE) + + input <- LETTERS + meta <- stageObject(input, tmp, path="foo") + expect_identical(meta$atomic_vector$type, "string") + writeMetadata(meta, tmp) + expect_identical(loadAtomicVector(meta, tmp), input) + + vals <- runif(25) + meta <- stageObject(vals, tmp, path="bar") + expect_identical(meta$atomic_vector$type, "number") + writeMetadata(meta, tmp) + expect_equal(loadAtomicVector(meta, tmp), vals) + + vals <- as.integer(rpois(99, 10)) + meta <- stageObject(vals, tmp, path="whee") + expect_identical(meta$atomic_vector$type, "integer") + writeMetadata(meta, tmp) + expect_identical(loadAtomicVector(meta, tmp), vals) + + vals <- rbinom(1000, 1, 0.5) > 0 + meta <- stageObject(vals, tmp, path="stuff") + expect_identical(meta$atomic_vector$type, "boolean") + writeMetadata(meta, tmp) + expect_identical(loadAtomicVector(meta, tmp), vals) + + vals <- c(Sys.Date(), Sys.Date() + 100, Sys.Date() - 100) + meta <- stageObject(vals, tmp, path="blah") + expect_identical(meta$atomic_vector$type, "string") + expect_identical(meta$atomic_vector$format, "date") + writeMetadata(meta, tmp) + expect_identical(loadAtomicVector(meta, tmp), vals) + + vals <- c(Sys.time(), Sys.time() + 100, Sys.time() - 100) + meta <- stageObject(vals, tmp, path="gunk") + expect_identical(meta$atomic_vector$type, "string") + expect_identical(meta$atomic_vector$format, "date-time") + writeMetadata(meta, tmp) + expect_true(all(abs(loadAtomicVector(meta, tmp) - vals) < 1)) # sub-second resolution on the strings. +}) + +test_that("vectors work correctly with names", { + tmp <- tempfile() + dir.create(tmp, recursive=TRUE) + + vals <- setNames(runif(26), LETTERS) + meta <- stageObject(vals, tmp, path="bar") + expect_identical(meta$atomic_vector$type, "number") + writeMetadata(meta, tmp) + expect_equal(loadAtomicVector(meta, tmp), vals) +})