Skip to content

Commit

Permalink
Support staging and loading of atomic vectors.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Sep 30, 2023
1 parent 8d50faf commit b107fff
Show file tree
Hide file tree
Showing 10 changed files with 362 additions and 32 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ export(createRedirection)
export(customloadObjectHelper)
export(listDirectory)
export(listLocalObjects)
export(loadAtomicVector)
export(loadBaseFactor)
export(loadBaseList)
export(loadDataFrame)
export(loadDataFrameFactor)
Expand Down
50 changes: 50 additions & 0 deletions R/loadAtomicVector.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#' Load an atomic vector
#'
#' Load a simple vector consisting of atomic elements from file.
#'
#' @inheritParams loadDataFrame
#' @param ... Further arguments, ignored.
#'
#' @return The vector described by \code{info}, possibly with names.
#'
#' @seealso
#' \code{"\link{stageObject,integer-method}"}, for one of the staging methods.
#'
#' @author Aaron Lun
#'
#' @examples
#' tmp <- tempfile()
#' dir.create(tmp)
#' meta <- stageObject(setNames(runif(26), letters), tmp, path="bar")
#' loadAtomicVector(meta, tmp)
#'
#' @export
loadAtomicVector <- function(info, project, ...) {
fpath <- acquireFile(project, info$path)
meta <- info$atomic_vector

df <- read.csv3(fpath, compression=meta$compression, nrows=meta$length)
output <- df[,ncol(df)]
type <- meta$type

if (type == "string") {
format <- meta$format
if (!is.null(format)) {
if (format == "date") {
output <- as.Date(output)
} else if (format == "date-time") {
output <- .cast_datetime(output)
} else {
output <- as.character(output)
}
}
} else {
stopifnot(.is_atomic(type))
output <- .cast_atomic(output, type)
}

if (isTRUE(meta$names)) {
names(output) <- df[,1]
}
output
}
7 changes: 3 additions & 4 deletions R/loadDataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {

# Make sure everyone is of the right type.
new.names <- character(ncol(df))
atomics <- c(integer="integer", number="double", string="character", boolean="logical")

for (i in seq_along(col.info)) {
new.names[i] <- col.info[[i]]$name
Expand All @@ -114,10 +113,10 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {

} else if (col.type=="date-time") {
# Remove colon in the timezone, which confuses as.POSIXct().
df[[i]] <- as.POSIXct(sub(":([0-9]{2})$", "\\1", df[[i]]), format="%Y-%m-%dT%H:%M:%S%z")
df[[i]] <- .cast_datetime(df[[i]])

} else if (col.type %in% names(atomics)) {
df[[i]] <- as(df[[i]], as.character(atomics[col.type]))
} else if (.is_atomic(col.type)) {
df[[i]] <- .cast_atomic(df[[i]], col.type)

} else if (col.type == "other") {
current <- acquireMetadata(project, col.info[[i]]$resource$path)
Expand Down
103 changes: 103 additions & 0 deletions R/stageAtomicVector.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#' @importFrom S4Vectors DataFrame
.stage_atomic_vector <- function(x, dir, path, child=FALSE, ...) {
dir.create(file.path(dir, path), showWarnings=FALSE)
new_path <- paste0(path, "/simple.txt.gz")

if (.is_datetime(x)) {
type <- "string"
format <- "date-time"
contents <- .sanitize_datetime(x)

} else if (is(x, "Date")) {
type <- "string"
format <- "date"
contents <- .sanitize_date(x)

} else {
stopifnot(is.atomic(x))
remapped <- .remap_atomic_type(x)
type <- remapped$type
contents <- remapped$values
format <- NULL
}

mock <- DataFrame(values=contents)
if (!is.null(names(x))) {
mock <- cbind(names=names(x), mock)
}
quickWriteCsv(mock, file.path(dir, new_path), row.names=FALSE, compression="gzip")

list(
`$schema` = "atomic_vector/v1.json",
path = new_path,
is_child = child,
atomic_vector = list(
type = type,
length = length(contents),
names = !is.null(names(x)),
format = format,
compression="gzip"
)
)
}

#' Stage atomic vectors
#'
#' Stage vectors containing atomic elements (or values that can be cast as such, e.g., dates and times).
#'
#' @param x Any of the atomic vector types, or \link{Date} objects, or time objects, e.g., \link{POSIXct}.
#' @inheritParams stageObject
#' @param ... Further arguments that are ignored.
#'
#' @return
#' A named list containing the metadata for \code{x}.
#' \code{x} itself is written to a CSV file inside \code{path}.
#'
#' @details
#' Dates and POSIX times are cast to strings;
#' the type itself is recorded in the metadata.
#'
#' @author Aaron Lun
#'
#' @examples
#' tmp <- tempfile()
#' dir.create(tmp)
#' stageObject(LETTERS, tmp, path="foo")
#' stageObject(setNames(runif(26), letters), tmp, path="bar")
#'
#' list.files(tmp, recursive=TRUE)
#'
#' @name stageAtomicVector
NULL

#' @export
#' @rdname stageAtomicVector
setMethod("stageObject", "integer", .stage_atomic_vector)

#' @export
#' @rdname stageAtomicVector
setMethod("stageObject", "character", .stage_atomic_vector)

#' @export
#' @rdname stageAtomicVector
setMethod("stageObject", "logical", .stage_atomic_vector)

#' @export
#' @rdname stageAtomicVector
setMethod("stageObject", "double", .stage_atomic_vector)

#' @export
#' @rdname stageAtomicVector
setMethod("stageObject", "numeric", .stage_atomic_vector)

#' @export
#' @rdname stageAtomicVector
setMethod("stageObject", "Date", .stage_atomic_vector)

#' @export
#' @rdname stageAtomicVector
setMethod("stageObject", "POSIXlt", .stage_atomic_vector)

#' @export
#' @rdname stageAtomicVector
setMethod("stageObject", "POSIXct", .stage_atomic_vector)
6 changes: 3 additions & 3 deletions R/stageBaseList.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
return(NULL)

} else if (is.atomic(x)) {
coerced <- .remap_type(x)
coerced <- .remap_atomic_type(x)

.label_hdf5_group(fpath, name,
uzuki_object="vector",
Expand Down Expand Up @@ -191,7 +191,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
.is_stringlike <- function(x) {
if (is(x, "Date")) {
return("date")
} else if (is(x, "POSIXct") || is(x, "POSIXlt")) {
} else if (.is_datetime(x)) {
return("date-time")
} else if (is.character(x)) {
return("string")
Expand Down Expand Up @@ -283,7 +283,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
return(formatted)

} else if (is.atomic(x)) {
formatted <- .remap_type(x)
formatted <- .remap_atomic_type(x)

y <- formatted$values
if (is.numeric(y) && !all(is.finite(y))) {
Expand Down
28 changes: 3 additions & 25 deletions R/stageDataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,19 +112,16 @@ setMethod("stageObject", "DataFrame", function(x, dir, path, child=FALSE, df.nam

x[[z]] <- as.character(col)

} else if (is.character(col)) {
out$type <- "string"

} else if (is(col, "POSIXct") || is(col, "POSIXlt")) {
} else if (.is_datetime(col)) {
out$type <- "date-time"
x[[z]] <- .sanitize_datetime(col)

} else if (is(col, "Date")) {
out$type <- "date"
x[[z]] <- format(col, "%Y-%m-%d")
x[[z]] <- .sanitize_date(col)

} else if (is.atomic(col)) {
coerced <- .remap_type(col)
coerced <- .remap_atomic_type(col)
out$type <- coerced$type
x[[z]] <- coerced$values

Expand Down Expand Up @@ -208,25 +205,6 @@ setMethod("stageObject", "DataFrame", function(x, dir, path, child=FALSE, df.nam
meta
})

.sanitize_datetime <- function(x) {
sub("([0-9]{2})$", ":\\1", strftime(x, "%Y-%m-%dT%H:%M:%S%z"))
}

.remap_type <- function(x) {
y <- typeof(x)

# Forcibly coercing the types, just to make sure that
# we don't get tricked by classes that might do something
# different inside write.csv or whatever.
switch(y,
integer=list(type="integer", values=as.integer(x)),
double=list(type="number", values=as.double(x)),
numeric=list(type="number", values=as.double(x)),
logical=list(type="boolean", values=as.logical(x)),
stop("type '", y, "' is not supported")
)
}

#' @importFrom rhdf5 h5write h5createGroup h5createFile
.write_hdf5_data_frame <- function(x, skippable, host, ofile) {
h5createFile(ofile)
Expand Down
42 changes: 42 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
.sanitize_date <- function(x) {
format(x, "%Y-%m-%d")
}

.sanitize_datetime <- function(x) {
sub("([0-9]{2})$", ":\\1", strftime(x, "%Y-%m-%dT%H:%M:%S%z"))
}

.is_datetime <- function(x) {
is(x, "POSIXct") || is(x, "POSIXlt")
}

.remap_atomic_type <- function(x) {
y <- typeof(x)

# Forcibly coercing the types, just to make sure that
# we don't get tricked by classes that might do something
# different inside write.csv or whatever.
switch(y,
integer=list(type="integer", values=as.integer(x)),
double=list(type="number", values=as.double(x)),
numeric=list(type="number", values=as.double(x)),
logical=list(type="boolean", values=as.logical(x)),
character=list(type="string", values=as.character(x)),
stop("type '", y, "' is not supported")
)
}

.cast_datetime <- function(x) {
# Remove colon in the timezone, which confuses as.POSIXct().
as.POSIXct(sub(":([0-9]{2})$", "\\1", x), format="%Y-%m-%dT%H:%M:%S%z")
}

.atomics <- c(integer="integer", number="double", string="character", boolean="logical")

.is_atomic <- function(type) {
type %in% names(.atomics)
}

.cast_atomic <- function(x, type) {
as(x, as.character(.atomics[type]))
}
35 changes: 35 additions & 0 deletions man/loadAtomicVector.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b107fff

Please sign in to comment.