Skip to content

Commit

Permalink
use rds to store package data files instead of parquet (#31)
Browse files Browse the repository at this point in the history
  • Loading branch information
cole-brokamp committed Feb 5, 2024
1 parent b5aef51 commit 41e0fea
Show file tree
Hide file tree
Showing 12 changed files with 92 additions and 43 deletions.
1 change: 0 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ Imports:
purrr,
tigris,
sf,
arrow,
terra,
fs,
httr,
Expand Down
10 changes: 6 additions & 4 deletions R/merra.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@
#' `merra_ss`, `merra_so4`, `merra_pm25`) with one row per date in `dates`
#' @export
#' @examples
#' \dontrun{
#' d <- list(
#' "8841b39a7c46e25f" = as.Date(c("2023-05-18", "2023-11-06")),
#' "8841a45555555555" = as.Date(c("2023-06-22", "2023-08-15"))
#' )
#' get_merra_data(x = s2::as_s2_cell(names(d)), dates = d)
#' }
get_merra_data <- function(x, dates) {
if (!inherits(x, "s2_cell")) stop("x must be a s2_cell vector", call. = FALSE)
d_merra <-
Expand All @@ -38,7 +40,7 @@ get_merra_data <- function(x, dates) {
format("%Y") |>
unique() |>
purrr::map_chr(\(.) install_merra_data(merra_year = .)) |>
purrr::map(arrow::read_parquet) |>
purrr::map(readRDS) |>
purrr::list_rbind() |>
dplyr::nest_by(s2) |>
dplyr::ungroup() |>
Expand Down Expand Up @@ -77,13 +79,13 @@ install_merra_data <- function(merra_year = as.character(2016:2023)) {
merra_year <- rlang::arg_match(merra_year)
dest_file <- fs::path(tools::R_user_dir("appc", "data"),
paste0(c("merra", merra_year), collapse = "_"),
ext = "parquet"
ext = "rds"
)
if (fs::file_exists(dest_file)) {
return(as.character(dest_file))
}
if (!install_source_preference()) {
install_released_data(released_data_name = glue::glue("merra_{merra_year}.parquet"))
install_released_data(released_data_name = glue::glue("merra_{merra_year}.rds"))
return(as.character(dest_file))
}
date_seq <- seq(as.Date(paste(c(merra_year, "01", "01"), collapse = "-")),
Expand All @@ -97,7 +99,7 @@ install_merra_data <- function(merra_year = as.character(2016:2023)) {
tibble::enframe(merra_data, name = "date") |>
dplyr::mutate(date = as.Date(date)) |>
tidyr::unnest(cols = c(value)) |>
arrow::write_parquet(dest_file)
saveRDS(dest_file)
return(as.character(dest_file))
}

Expand Down
15 changes: 8 additions & 7 deletions R/nei.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
#' @export
#' @examples
#' get_nei_point_summary(s2::as_s2_cell(c("8841b399ced97c47", "8841b38578834123")), year = "2020")
#' get_nei_point_summary(s2::as_s2_cell(c("8841b399ced97c47", "8841b38578834123")), year = "2017")
get_nei_point_summary <- function(x, year = c("2020", "2017"), pollutant_code = c("PM25-PRI", "EC", "OC", "SO4", "NO3", "PMFINE"), buffer = 1000) {
year <- rlang::arg_match(year)
if (!inherits(x, "s2_cell")) stop("x must be a s2_cell vector", call. = FALSE)
nei_data <- arrow::read_parquet(install_nei_point_data(year = year))
nei_data <- readRDS(install_nei_point_data(year = year))
pollutant_code <- rlang::arg_match(pollutant_code)
message("intersecting ", year, " ", pollutant_code, " NEI point sources within ", buffer, " meters")
## message("intersecting ", year, " ", pollutant_code, " NEI point sources within ", buffer, " meters")
withins <- s2::s2_dwithin_matrix(s2::s2_cell_to_lnglat(x), s2::s2_cell_to_lnglat(nei_data$s2), distance = buffer)
summarize_emissions <- function(i) {
nei_data[withins[[i]], ] |>
Expand All @@ -34,7 +35,7 @@ get_nei_point_summary <- function(x, year = c("2020", "2017"), pollutant_code =
}

#' Installs NEI point data into user's data directory for the `appc` package
#' @return for `get_nei_point_data()`, a character string path to NEI point data parquet file
#' @return for `get_nei_point_data()`, a character string path to NEI point data RDS file
#' @details The NEI file is downloaded, unzipped, and filtered to observations
#' with a pollutant code of `EC`, `OC`, `SO4`, `NO3`, `PMFINE`, or `PM25-PRI`.
#' Latitude and longitude are encoded as an s2 vector, column names are cleaned,
Expand All @@ -43,18 +44,18 @@ get_nei_point_summary <- function(x, year = c("2020", "2017"), pollutant_code =
#' @export
install_nei_point_data <- function(year = c("2020", "2017")) {
year <- rlang::arg_match(year)
dest_file <- fs::path(tools::R_user_dir("appc", "data"), glue::glue("nei_{year}.parquet"))
dest_file <- fs::path(tools::R_user_dir("appc", "data"), glue::glue("nei_{year}.rds"))
if (file.exists(dest_file)) return(dest_file)
if (!install_source_preference()) {
install_released_data(released_data_name = glue::glue("nei_{year}.parquet"))
install_released_data(released_data_name = glue::glue("nei_{year}.rds"))
return(as.character(dest_file))
}
message(glue::glue("downloading {year} NEI file"))
zip_path <- fs::path(tempdir(), glue::glue("nei_{year}.zip"))
dl_url <-
dplyr::case_when(year == "2020" ~ "https://gaftp.epa.gov/air/nei/2020/data_summaries/Facility%20Level%20by%20Pollutant.zip",
year == "2017" ~ "https://gaftp.epa.gov/air/nei/2017/data_summaries/2017v1/2017neiJan_facility.zip")
utils::download.file(dl_url, dest_file, quiet = FALSE)
utils::download.file(dl_url, zip_path, quiet = FALSE)
nei_raw_paths <- utils::unzip(zip_path, exdir = tempdir())
grep(".csv", nei_raw_paths, fixed = TRUE, value = TRUE) |>
readr::read_csv(col_types = readr::cols_only(
Expand All @@ -70,7 +71,7 @@ install_nei_point_data <- function(year = c("2020", "2017")) {
dplyr::select(-`site latitude`, -`site longitude`) |>
dplyr::rename_with(~ tolower(gsub(" ", "_", .x, fixed = TRUE))) |>
stats::na.omit() |>
arrow::write_parquet(dest_file)
saveRDS(dest_file)
return(as.character(dest_file))
}

Expand Down
12 changes: 6 additions & 6 deletions R/smoke.R
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
#' Installs daily, census-tract level smoke pm data into user's data directory for the `appc` package
#'
#' See the examples to read the installed parquet file.
#' See the examples to read the installed RDS file.
#' Merge this data with existing data on `date` and `census_tract_id_2010` to retrieve the
#' `smoke_pm` column. Note that any census tract-date combination implicitly missing has a value of zero.
#' @references <https://pubmed.ncbi.nlm.nih.gov/36134580/>
#' @references <https://github.com/echolab-stanford/daily-10km-smokePM>
#' @return path to parquet file containing smoke data
#' @return path to RDS file containing smoke data
#' @export
#' @examples
#' \dontrun{
#' arrow::read_parquet(install_smoke_pm_data())
#' readRDS(install_smoke_pm_data())
#' }
install_smoke_pm_data <- function() {
dest_file <- fs::path(tools::R_user_dir("appc", "data"), "smoke.parquet")
dest_file <- fs::path(tools::R_user_dir("appc", "data"), "smoke.rds")
if (file.exists(dest_file)) {
return(as.character(dest_file))
}
if (!install_source_preference()) {
install_released_data(released_data_name = "smoke.parquet")
install_released_data(released_data_name = "smoke.rds")
return(as.character(dest_file))
}
message("downloading and installing smoke PM data from source")
Expand All @@ -37,7 +37,7 @@ install_smoke_pm_data <- function() {
smoke_pm = smokePM_pred
) |>
dplyr::filter(date > as.Date("2015-12-31"))
arrow::write_parquet(d_smoke, dest_file)
saveRDS(d_smoke, dest_file)
return(as.character(dest_file))
}

Expand Down
3 changes: 2 additions & 1 deletion inst/make_training_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ if(!require(appc)) devtools::load_all()
## # download any geomarker data ahead of time, if not already cached
## c(
## install_elevation_data(),
## install_traffic(),
## tidyr::expand_grid(narr_var = c("air.2m", "hpbl", "acpcp", "rhum.2m", "vis", "pres.sfc", "uwnd.10m", "vwnd.10m"),
## narr_year = as.character(2017:2023)) |>
## purrr::pmap_chr(install_narr_data),
Expand Down Expand Up @@ -121,7 +122,7 @@ d <-

# smoke
d$census_tract_id_2010 <- get_census_tract_id(d$s2, year = 2010)
d_smoke <- arrow::read_parquet(install_smoke_pm_data())
d_smoke <- readRDS(install_smoke_pm_data())
d <-
d |>
left_join(d_smoke, by = c("census_tract_id_2010", "date")) |>
Expand Down
72 changes: 53 additions & 19 deletions justfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# set shell := ["R", "-e"]
set dotenv-load
pkg_version := `Rscript -e "cat(desc::desc_get('Version'))"`
geomarker_folder := `Rscript -e "cat(tools::R_user_dir('appc', 'data'))"`

# document R package
document:
Expand All @@ -22,30 +23,63 @@ build_site: document
make_training_data:
Rscript inst/make_training_data.R

# upload grf model to current github release
upload_grf:
gh release upload v{{pkg_version}} "inst/rf_pm.rds"

geomarker_folder := `Rscript -e "cat(tools::R_user_dir('appc', 'data'))"`
# upload precomputed geomarker data to current github release
upload_geo_data:
gh release upload v{{pkg_version}} "{{geomarker_folder}}/smoke.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/hpms_f123_aadt.rds"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/nei_2020.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/nei_2017.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2017.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2018.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2019.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2020.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2021.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2022.parquet"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2023.parquet"

# train grf model
train:
Rscript inst/train_model.R

# upload grf model to current github release
upload_grf:
gh release upload v{{pkg_version}} "inst/rf_pm.rds"

# create CV accuracy report
report:
R -e "rmarkdown::render('./inst/APPC_prediction_evaluation.Rmd')"
open inst/APPC_prediction_evaluation.html

# install nei data from source and upload to github release
release_nei_data:
rm -f "{{geomarker_folder}}/nei_2017.rds"
R --quiet -e \
"devtools::load_all(); \
options('appc_install_data_from_source' = TRUE); \
install_nei_point_data('2017')"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/nei_2017.rds"
rm -f "{{geomarker_folder}}/nei_2020.rds"
R --quiet -e \
"devtools::load_all(); \
options('appc_install_data_from_source' = TRUE); \
install_nei_point_data('2020')"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/nei_2020.rds"

# install smoke data from source and upload to github release
release_smoke_data:
rm -f "{{geomarker_folder}}/smoke.rds"
R --quiet -e \
"devtools::load_all(); \
options('appc_install_data_from_source' = TRUE); \
install_smoke_pm_data()"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/smoke.rds"

# install traffic data from source and upload to github release
release_traffic_data:
rm -f "{{geomarker_folder}}/hpms_f123_aadt.rds"
R --quiet -e \
"devtools::load_all(); \
options('appc_install_data_from_source' = TRUE); \
options('timeout' = 3000); \
install_traffic()"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/hpms_f123_aadt.rds"

# install merra data from source and upload to github release
release_merra_data:
export APPC_INSTALL_DATA_FROM_SOURCE=TRUE
rm "{{geomarker_folder}}/merra_2017.rds"
R -f -e "install_merra_data('2017')"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2017.rds"
rm "{{geomarker_folder}}/merra_2018.rds"
R -f -e "install_merra_data('2018')"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2018.rds"
rm "{{geomarker_folder}}/merra_2017.rds"
R -f -e "install_merra_data('2017')"
gh release upload v{{pkg_version}} "{{geomarker_folder}}/merra_2017.rds"

2 changes: 2 additions & 0 deletions man/get_merra_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/get_nei_point_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/install_smoke_pm_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions tests/testthat/test-merra-daily.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
if (file.exists(".env")) dotenv::load_dot_env()
earthdata_secrets <- Sys.getenv(c("EARTHDATA_USERNAME", "EARTHDATA_PASSWORD"), unset = NA)
skip_if(any(is.na(earthdata_secrets)), message = "no earthdata credentials found")
skip()

test_that("getting daily merra from GES DISC works", {
# "normal" pattern
Expand Down
2 changes: 2 additions & 0 deletions tests/testthat/test-merra.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
skip()

test_that("get_merra_data works", {
d <- list(
"8841b39a7c46e25f" = as.Date(c("2023-05-18", "2023-11-06")),
Expand Down
8 changes: 7 additions & 1 deletion tests/testthat/test-nei.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
test_that("get_nei_point_summary works", {
get_nei_point_summary(s2::as_s2_cell(c("8841b399ced97c47", "8841b38578834123")), year = "2020") |>
get_nei_point_summary(s2::as_s2_cell(c("8841b399ced97c47", "8841b38578834123")),
year = "2020",
pollutant_code = "PM25-PRI", buffer = 1000) |>
expect_equal(c(6.30285480860561e-05, 1.77088909030131e-05))
get_nei_point_summary(s2::as_s2_cell(c("8841b399ced97c47", "8841b38578834123")),
year = "2017",
pollutant_code = "PMFINE", buffer = 1500) |>
expect_equal(c(7.45512086860517e-05, 2.20420317344646e-05))
})

0 comments on commit 41e0fea

Please sign in to comment.