fix merra URLs, closes #23; download.file everywhere (#28)
* tests and examples for geomarker functions
* implemented conditional patterns for MERRA URLs during certain months in certain years
* everything is download.file now (except basic auth for merra)
cole-brokamp committed Feb 3, 2024
1 parent 731fca6 commit 514667a
Showing 14 changed files with 110 additions and 75 deletions.
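
The same download pattern now runs in each installer; a minimal, illustrative sketch of it (the URL and file name below are placeholders, not taken from any one file in this commit):

# hypothetical sketch of the utils::download.file() pattern used throughout the package
dest_file <- fs::path(tools::R_user_dir("appc", "data"), "example_data.parquet")
utils::download.file(
  url = "https://example.com/example_data.parquet", # placeholder URL
  destfile = dest_file,
  quiet = FALSE
)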
2 changes: 2 additions & 0 deletions .github/workflows/R-CMD-check.yaml
@@ -26,6 +26,8 @@ jobs:

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
EARTHDATA_USERNAME: ${{ secrets.EARTHDATA_USERNAME }}
EARTHDATA_PASSWORD: ${{ secrets.EARTHDATA_PASSWORD }}
R_KEEP_PKG_SOURCE: yes

steps:
18 changes: 8 additions & 10 deletions R/aqs.R
@@ -34,12 +34,8 @@ get_daily_aqs <- function(pollutant = c("pm25", "ozone", "no2"), year = "2021")
)[pollutant]
file_name <- glue::glue("daily_{pollutant_code}_{year}.zip")
on.exit(unlink(file_name))
# TODO change to httr or httr2
utils::download.file(
url = glue::glue("https://aqs.epa.gov/aqsweb/airdata/{file_name}"),
destfile = file_name,
quiet = TRUE
)
glue::glue("https://aqs.epa.gov/aqsweb/airdata/{file_name}") |>
utils::download.file(destfile = file_name, quiet = TRUE)
unzipped_file_name <- gsub(pattern = ".zip", ".csv", file_name, fixed = TRUE)
on.exit(unlink(unzipped_file_name), add = TRUE)
utils::unzip(file_name)
@@ -64,7 +60,9 @@ get_daily_aqs <- function(pollutant = c("pm25", "ozone", "no2"), year = "2021")
return(d_out)
}

utils::globalVariables(c("Sample Duration", "Observation Percent",
"State Code", "County Code", "Site Num",
"Latitude", "Longitude", "Arithmetic Mean", "Date Local",
"lon", "lat", "conc"))
utils::globalVariables(c(
"Sample Duration", "Observation Percent",
"State Code", "County Code", "Site Num",
"Latitude", "Longitude", "Arithmetic Mean", "Date Local",
"lon", "lat", "conc"
))
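
A usage sketch for the updated AQS downloader (assumes internet access and that the AQS annual summary file naming is unchanged; see the function's help page for the exact output columns):

# download and read the 2021 daily PM2.5 summary file from the EPA AQS site
d_pm25 <- get_daily_aqs(pollutant = "pm25", year = "2021")
head(d_pm25)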
2 changes: 2 additions & 0 deletions R/helper.R
@@ -1,5 +1,7 @@
.onLoad <- function(...) {
dir.create(tools::R_user_dir("appc", "data"), recursive = TRUE, showWarnings=FALSE)
options(timeout = max(900, getOption("timeout")),
download.file.method = "libcurl")
}

#' Get the geography of the 2020 contiguous United States
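
A minimal sketch of how these new defaults interact with user settings, assuming the package is attached in a fresh session; because .onLoad() takes max(900, getOption("timeout")), a larger user-set timeout is preserved:

options(timeout = 1800)              # user raises the timeout before loading
library(appc)                        # .onLoad() keeps max(900, 1800) = 1800
getOption("timeout")                 # 1800
getOption("download.file.method")    # "libcurl"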
21 changes: 10 additions & 11 deletions R/install_released_data.R
@@ -1,5 +1,5 @@
#' Download pre-installed data from GitHub release
#'
#'
#' `install_smoke_pm_data()`, `install_merra_data()`, `install_traffic()`, and `install_nei_point_data()`
#' all download geospatial data directly from the provider and then transform or subset data into
#' smaller files to be used by appc `get_*_data()` functions.
@@ -9,7 +9,7 @@
#' These functions are utilized automatically by the geomarker assessment functions,
#' but can be called without input data to install the geomarker data ahead of time,
#' if external internet access is not possible after input data is added.
#' *Note that some of the install functions require a system installation of `gdal`.*
#' *Note that some of the install functions require a system installation of `gdal`.*
#'
#' To turn *off* the default usage of downloading
#' pre-generated data and to instead install data
@@ -20,15 +20,14 @@
#' @keywords internal
install_released_data <- function(released_data_name, package_version = utils::packageVersion("appc")) {
dest_file <- fs::path(tools::R_user_dir("appc", "data"), released_data_name)
glue::glue("https://github.com", "geomarker-io",
"appc", "releases", "download",
"v{package_version}",
released_data_name,
.sep = "/"
) |>
httr2::request() |>
httr2::req_progress() |>
httr2::req_perform(path = dest_file)
dl_url <- glue::glue(
"https://github.com", "geomarker-io",
"appc", "releases", "download",
"v{package_version}",
released_data_name,
.sep = "/"
)
utils::download.file(dl_url, dest_file, quiet = FALSE)
}

#' install_source_preference()
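
A hedged usage sketch for the new downloader (the asset name mirrors the release assets uploaded in the justfile below; availability depends on the tagged release):

# internal helper, normally called for you by install_merra_data() and friends;
# downloads the pre-generated 2023 MERRA data into the appc user data directory
install_released_data(released_data_name = "merra_2023.parquet")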
79 changes: 43 additions & 36 deletions R/merra.R
@@ -1,5 +1,5 @@
#' Get MERRA-2 aerosol diagnostics data
#'
#'
#' Total and component (Dust, OC, BC, SS, SO4) surface PM2.5 concentrations
#' from the MERRA-2 [M2T1NXAER v5.12.4](https://disc.gsfc.nasa.gov/datasets/M2T1NXAER_5.12.4/summary) product.
#' @details
@@ -23,11 +23,11 @@
#' `merra_ss`, `merra_so4`, `merra_pm25`) with one row per date in `dates`
#' @export
#' @examples
#' # d <- list(
#' # "8841b39a7c46e25f" = as.Date(c("2023-05-18", "2023-11-06")),
#' # "8841a45555555555" = as.Date(c("2023-06-22", "2023-08-15"))
#' # )
#' # get_merra_data(x = s2::as_s2_cell(names(d)), dates = d)
#' d <- list(
#' "8841b39a7c46e25f" = as.Date(c("2023-05-18", "2023-11-06")),
#' "8841a45555555555" = as.Date(c("2023-06-22", "2023-08-15"))
#' )
#' get_merra_data(x = s2::as_s2_cell(names(d)), dates = d)
get_merra_data <- function(x, dates) {
if (!inherits(x, "s2_cell")) stop("x must be a s2_cell vector", call. = FALSE)
d_merra <-
@@ -46,7 +46,7 @@ get_merra_data <- function(x, dates) {
dplyr::mutate(s2_geography = s2::s2_cell_to_lnglat(s2)) |>
stats::na.omit() # some s2 failed to convert to lnglat ?

x_closest_merra <-
x_closest_merra <-
x |>
s2::s2_cell_to_lnglat() |>
s2::s2_closest_feature(d_merra$s2_geography)
@@ -57,13 +57,13 @@

out <-
purrr::map2(x_closest_merra$data, dates,
\(xx, dd) {
tibble::tibble(date = dd) |>
dplyr::left_join(xx, by = "date") |>
dplyr::select(-date)
},
.progress = "extracting closest merra data"
)
\(xx, dd) {
tibble::tibble(date = dd) |>
dplyr::left_join(xx, by = "date") |>
dplyr::select(-date)
},
.progress = "extracting closest merra data"
)
names(out) <- as.character(x)
return(out)
}
@@ -76,15 +76,20 @@ get_merra_data <- function(x, dates) {
install_merra_data <- function(merra_year = as.character(2016:2023)) {
merra_year <- rlang::arg_match(merra_year)
dest_file <- fs::path(tools::R_user_dir("appc", "data"),
paste0(c("merra", merra_year), collapse = "_"), ext = "parquet")
if (fs::file_exists(dest_file)) return(as.character(dest_file))
paste0(c("merra", merra_year), collapse = "_"),
ext = "parquet"
)
if (fs::file_exists(dest_file)) {
return(as.character(dest_file))
}
if (!install_source_preference()) {
install_released_data(released_data_name = glue::glue("merra_{merra_year}.parquet"))
return(as.character(dest_file))
}
date_seq <- seq(as.Date(paste(c(merra_year, "01", "01"), collapse = "-")),
as.Date(paste(c(merra_year, "12", "31"), collapse = "-")),
by = 1)
as.Date(paste(c(merra_year, "12", "31"), collapse = "-")),
by = 1
)
message(glue::glue("downloading and subsetting daily MERRA files for {merra_year}"))
# takes a long time, so cache intermediate daily downloads and extractions
merra_data <- mappp::mappp(date_seq, create_daily_merra_data, cache = TRUE, cache_name = "merra_cache")
@@ -98,7 +103,7 @@ install_merra_data <- function(merra_year = as.character(2016:2023)) {

#' `create_daily_merra_data` downloads and computes MERRA PM2.5 data for a single day
#' @return for `create_daily_merra_data()`, a tibble with columns for s2,
#' date, and concentrations of PM2.5 total, dust, oc, bc, ss, so4
#' date, and concentrations of PM2.5 total, dust, oc, bc, ss, so4
#' @export
#' @rdname get_merra_data
create_daily_merra_data <- function(merra_date) {
@@ -107,25 +112,25 @@ create_daily_merra_data <- function(merra_date) {
earthdata_secrets <- Sys.getenv(c("EARTHDATA_USERNAME", "EARTHDATA_PASSWORD"), unset = NA)
if (any(is.na(earthdata_secrets))) stop("EARTHDATA_USERNAME or EARTHDATA_PASSWORD environment variables are unset", call. = FALSE)
tf <- tempfile(fileext = ".nc4")
fs::path("https://goldsmr4.gesdisc.eosdis.nasa.gov/data/MERRA2",
"M2T1NXAER.5.12.4",
format(the_date, "%Y"),
format(the_date, "%m"),
paste0("MERRA2_400.tavg1_2d_aer_Nx.", format(the_date, "%Y%m%d")),
ext = "nc4"
) |>
req_url <-
fs::path("https://goldsmr4.gesdisc.eosdis.nasa.gov/data/MERRA2",
"M2T1NXAER.5.12.4",
format(the_date, "%Y"),
format(the_date, "%m"),
paste0("MERRA2_400.tavg1_2d_aer_Nx.", format(the_date, "%Y%m%d")),
ext = "nc4"
)
if ((format(the_date, "%Y") == "2020" & format(the_date, "%m") == "09") ||
(format(the_date, "%Y") == "2021" & format(the_date, "%m") %in% c("06", "07", "08", "09"))) {
req_url <- gsub("MERRA2_400.", "MERRA2_401.", req_url, fixed = TRUE)
}
req_url |>
httr2::request() |>
httr2::req_auth_basic(
username = earthdata_secrets["EARTHDATA_USERNAME"],
password = earthdata_secrets["EARTHDATA_PASSWORD"]
) |>
## httr2::req_progress() |>
httr2::req_retry(max_tries = 3) |>
## httr2::req_proxy("http://bmiproxyp.chmcres.cchmc.org",
## port = 80,
## username = Sys.getenv("CCHMC_USERNAME"),
## password = Sys.getenv("CCHMC_PASSWORD")
## ) |>
httr2::req_cache(tempdir()) |>
httr2::req_perform(path = tf)
out <-
tidync::tidync(tf) |>
@@ -150,6 +155,8 @@ create_daily_merra_data <- function(merra_date) {
return(out)
}

utils::globalVariables(c("DUSMASS25", "OCSMASS", "BCSMASS", "SSSMASS25",
"SO4SMASS", "merra_dust", "merra_oc", "merra_oc",
"merra_bc", "merra_ss", "merra_so4", "value"))
utils::globalVariables(c(
"DUSMASS25", "OCSMASS", "BCSMASS", "SSSMASS25",
"SO4SMASS", "merra_dust", "merra_oc", "merra_oc",
"merra_bc", "merra_ss", "merra_so4", "value"
))
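
A worked sketch of the MERRA2_400 to MERRA2_401 stream switch implemented above; the helper name merra_file_name is illustrative and not part of the package:

merra_file_name <- function(the_date) {
  the_date <- as.Date(the_date)
  stream <- "MERRA2_400"
  # the GES DISC archive uses stream 401 for Sep 2020 and Jun-Sep 2021
  if ((format(the_date, "%Y") == "2020" && format(the_date, "%m") == "09") ||
      (format(the_date, "%Y") == "2021" && format(the_date, "%m") %in% c("06", "07", "08", "09"))) {
    stream <- "MERRA2_401"
  }
  paste0(stream, ".tavg1_2d_aer_Nx.", format(the_date, "%Y%m%d"), ".nc4")
}

merra_file_name("2023-05-23") # "MERRA2_400.tavg1_2d_aer_Nx.20230523.nc4"
merra_file_name("2020-09-02") # "MERRA2_401.tavg1_2d_aer_Nx.20200902.nc4"
merra_file_name("2021-06-16") # "MERRA2_401.tavg1_2d_aer_Nx.20210616.nc4"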
2 changes: 1 addition & 1 deletion R/narr.R
@@ -51,7 +51,7 @@ install_narr_data <- function(narr_var = c("air.2m", "hpbl", "acpcp", "rhum.2m",
"Datasets", "NARR", "Dailies", "monolevel",
"{narr_var}.{narr_year}.nc",
.sep = "/"
) |> # TODO replace with httr2
) |>
utils::download.file(destfile = dest_file)
return(dest_file)
}
7 changes: 4 additions & 3 deletions R/nei.R
@@ -51,9 +51,10 @@ install_nei_point_data <- function(year = c("2020", "2017")) {
}
message(glue::glue("downloading {year} NEI file"))
zip_path <- fs::path(tempdir(), glue::glue("nei_{year}.zip"))
dplyr::case_when(year == "2020" ~ "https://gaftp.epa.gov/air/nei/2020/data_summaries/Facility%20Level%20by%20Pollutant.zip",
year == "2017" ~ "https://gaftp.epa.gov/air/nei/2017/data_summaries/2017v1/2017neiJan_facility.zip") |>
httr::GET(httr::write_disk(zip_path), httr::progress(), overwrite = TRUE)
dl_url <-
dplyr::case_when(year == "2020" ~ "https://gaftp.epa.gov/air/nei/2020/data_summaries/Facility%20Level%20by%20Pollutant.zip",
year == "2017" ~ "https://gaftp.epa.gov/air/nei/2017/data_summaries/2017v1/2017neiJan_facility.zip")
  utils::download.file(dl_url, zip_path, quiet = FALSE)
nei_raw_paths <- utils::unzip(zip_path, exdir = tempdir())
grep(".csv", nei_raw_paths, fixed = TRUE, value = TRUE) |>
readr::read_csv(col_types = readr::cols_only(
2 changes: 1 addition & 1 deletion R/nlcd.R
@@ -48,7 +48,7 @@ install_impervious <- function(year = as.character(c(2019, 2016, 2013, 2011, 200
message(glue::glue("downloading {year} NLCD impervious raster"))
nlcd_zip_path <- fs::path(tempdir(), glue::glue("nlcd_impervious_{year}.zip"))
glue::glue("https://s3-us-west-2.amazonaws.com/mrlc/nlcd_{year}_impervious_l48_20210604.zip") |>
httr::GET(httr::write_disk(nlcd_zip_path, overwrite = TRUE), httr::progress())
    utils::download.file(destfile = nlcd_zip_path)
nlcd_raw_paths <- utils::unzip(nlcd_zip_path, exdir = tempdir())
message(glue::glue("converting {year} NLCD impervious raster"))
system2(
5 changes: 1 addition & 4 deletions R/smoke.R
@@ -22,10 +22,7 @@ install_smoke_pm_data <- function() {
}
message("downloading and installing smoke PM data from source")
tf <- tempfile()
"https://www.dropbox.com/sh/atmtfc54zuknnob/AAA7AVRQP-GoIMHpxlvfN7RBa?dl=1" |>
httr2::request() |>
httr2::req_progress() |>
httr2::req_perform(path = tf)
utils::download.file("https://www.dropbox.com/sh/atmtfc54zuknnob/AAA7AVRQP-GoIMHpxlvfN7RBa?dl=1", tf)
d_smoke <-
unz(tf, grep(".csv", utils::unzip(tf, list = TRUE)$Name, value = TRUE)) |>
readr::read_csv(
4 changes: 1 addition & 3 deletions R/traffic.R
@@ -56,9 +56,7 @@ install_traffic <- function() {
message("downloading and installing HPMS data from source")
dest_path <- tempfile(fileext = ".gdb.zip")
"https://www.arcgis.com/sharing/rest/content/items/c199f2799b724ffbacf4cafe3ee03e55/data" |>
httr2::request() |>
httr2::req_progress() |>
httr2::req_perform(path = dest_path)
    utils::download.file(destfile = dest_path)
hpms_states <-
sf::st_layers(dsn = dest_path)$name |>
strsplit("_", fixed = TRUE) |>
7 changes: 6 additions & 1 deletion justfile
@@ -33,8 +33,13 @@ upload_geo_data:
gh release upload v{{pkg_version}} "{{geomarker_folder}}/hpms_f123_aadt.rds"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/nei_2020.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/nei_2017.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2023.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2017.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2018.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2019.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2020.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2021.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2022.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2023.parquet"

# train grf model
train:
10 changes: 5 additions & 5 deletions man/get_merra_data.Rd

Diff not rendered (generated documentation file).

16 changes: 16 additions & 0 deletions tests/testthat/test-merra-daily.R
@@ -0,0 +1,16 @@
if (file.exists(".env")) dotenv::load_dot_env()
earthdata_secrets <- Sys.getenv(c("EARTHDATA_USERNAME", "EARTHDATA_PASSWORD"), unset = NA)
skip_if(any(is.na(earthdata_secrets)), message = "no earthdata credentials found")

test_that("getting daily merra from GES DISC works", {
# "normal" pattern
# merra site uses 401 instead of 400 for september 2020 dates
# merra site uses 401 instead of 400 for jun, jul, aug, sep 2021 dates
out <-
list(
create_daily_merra_data("2023-05-23"),
create_daily_merra_data(merra_date = "2020-09-02"),
create_daily_merra_data(merra_date = "2021-06-16")
)
expect_equal(sapply(out, nrow), c(4800, 4800, 4800))
})
10 changes: 10 additions & 0 deletions tests/testthat/test-merra.R
@@ -0,0 +1,10 @@
test_that("get_merra_data works", {
d <- list(
"8841b39a7c46e25f" = as.Date(c("2023-05-18", "2023-11-06")),
"8841a45555555555" = as.Date(c("2023-06-22", "2023-08-15"))
)
out <- get_merra_data(x = s2::as_s2_cell(names(d)), dates = d)
expect_equal(length(out), 2)
expect_equal(lapply(out, nrow), list("8841b39a7c46e25f" = 2, "8841a45555555555" = 2))
expect_equal(out$`8841b39a7c46e25f`$merra_dust, c(1.77165780194481, 0.841950516250467))
})
