Skip to content

Commit

Permalink
Updated the NA handling policy for HDF5 data frames.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Oct 3, 2023
1 parent 4249d95 commit a58768a
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 12 deletions.
11 changes: 9 additions & 2 deletions R/loadDataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,15 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
if (version_above_1 || is.character(current)) {
attr <- h5readAttributes(path, prefix(paste0("data/", i)))
replace.na <- attr[["missing-value-placeholder"]]
if (!is.null(replace.na) && !is.na(replace.na)) {
raw[[i]][current == replace.na] <- NA

if (is.null(replace.na) || (is.na(replace.na) && !is.nan(replace.na))) {
# No-op as there are no NAs or the placeholder is already R's NA.
} else if (is.nan(replace.na)) {
# In case we have an NaN as a placeholder for NA.
raw[[i]][is.nan(current)] <- NA
} else {
# Using which() to avoid problems with existing NAs.
raw[[i]][which(current == replace.na)] <- NA
}
}
}
Expand Down
9 changes: 1 addition & 8 deletions R/stageBaseList.R
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
y[is.na(y)] <- placeholder
}
} else if (anyNA(y)) {
if (is.double(y)) {
# Avoid unnecessary NA checks if they are all actually NaNs.
if (sum(is.na(y)) > sum(is.nan(y))) {
placeholder <- NA_real_
}
} else {
placeholder <- as(NA, storage.mode(y))
}
placeholder <- .choose_numeric_missing_placeholder(y)
}
} else {
if (is.logical(y) && anyNA(y)) {
Expand Down
4 changes: 2 additions & 2 deletions R/stageDataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -250,11 +250,11 @@ setMethod("stageObject", "DataFrame", function(x, dir, path, child=FALSE, df.nam

missing.placeholder <- NULL
if (anyNA(current)) {
if (is.character(current) && anyNA(current)) {
if (is.character(current)) {
missing.placeholder <- chooseMissingStringPlaceholder(current)
current[is.na(current)] <- missing.placeholder
} else if (.version > 1) {
missing.placeholder <- as(NA, storage.mode(current))
missing.placeholder <- .choose_numeric_missing_placeholder(current)
}
}

Expand Down
12 changes: 12 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,15 @@
.cast_atomic <- function(x, type) {
as(x, as.character(.atomics[type]))
}

.choose_numeric_missing_placeholder <- function(x) {
if (is.double(x)) {
if (sum(is.na(x)) > sum(is.nan(x))) {
return(NA_real_)
} else {
return(NULL)
}
} else {
return(as(NA, storage.mode(x)))
}
}
13 changes: 13 additions & 0 deletions tests/testthat/test-DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,19 @@ test_that("handling of NAs works correctly", {
meta2 <- stageObject(df, tmp, path="WHEE.h5")
expect_match(meta2[["$schema"]], "hdf5_data_frame")

fpath <- file.path(tmp, meta2$path)
attrs <- rhdf5::h5readAttributes(fpath, "contents/data/0")
expect_identical(attrs[["missing-value-placeholder"]], "NA")
attrs <- rhdf5::h5readAttributes(fpath, "contents/data/1")
expect_identical(attrs[["missing-value-placeholder"]], "_NA")
attrs <- rhdf5::h5readAttributes(fpath, "contents/data/4")
expect_true(is.na(attrs[["missing-value-placeholder"]]))
attrs <- rhdf5::h5readAttributes(fpath, "contents/data/5")
place <- attrs[["missing-value-placeholder"]]
expect_true(is.na(place) && !is.nan(place))
attrs <- rhdf5::h5readAttributes(fpath, "contents/data/6")
expect_true(is.na(attrs[["missing-value-placeholder"]]))

round2 <- loadDataFrame(meta2, project=tmp)
expect_identical(df, round2)
})
Expand Down

0 comments on commit a58768a

Please sign in to comment.