Skip to content

Commit

Permalink
fix incorrect proxy id.outcome
Browse files Browse the repository at this point in the history
  • Loading branch information
matt committed Jun 5, 2024
1 parent 38f9731 commit 11e6407
Showing 1 changed file with 129 additions and 77 deletions.
206 changes: 129 additions & 77 deletions R/TwoSampleMR_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -713,17 +713,17 @@ proxy_search <- function(data_exposure, data_outcome, data_outcome_path, data_re
outcome_sep, outcome_phenotype, outcome_SNP, outcome_BETA, outcome_SE, outcome_P,
outcome_EA, outcome_OA, outcome_EAF, outcome_N, outcome_ID, outcome_CHR, outcome_POS) {

# Parameter Validation
# parameter Validation ====
if (!file.exists(data_outcome_path) || !file.exists(paste0(data_reference))) {
stop("Invalid file paths provided.")
}

if (!is.data.frame(data_exposure) || !is.data.frame(data_outcome)) {
stop("data_exposure and data_outcome must be data frames.")
}

# exposure snps missing from outcome ====
snps_missing <- setdiff(unique(as.factor(data_exposure$SNP)), unique(as.factor(data_outcome$SNP)))
# identify missing SNPs in the outcome data ====
snps_missing <- setdiff(unique(as.factor(data_exposure$SNP)),
unique(as.factor(data_outcome$SNP)))
if (length(snps_missing) == 0) {
message("No missing SNPs. Returning the original dataframe.")
return(data_outcome)
Expand Down Expand Up @@ -760,8 +760,7 @@ proxy_search <- function(data_exposure, data_outcome, data_outcome_path, data_re
proxy_a1.outcome = B1,
proxy_a2.outcome = B2,
R) %>%
dplyr::mutate(proxy.outcome = TRUE,
SNP = proxy_snp.outcome) %>%
dplyr::mutate(proxy.outcome = TRUE) %>%
dplyr::select(proxy.outcome, everything())
message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)), " missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) - length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)"))

Expand All @@ -785,7 +784,11 @@ proxy_search <- function(data_exposure, data_outcome, data_outcome_path, data_re
id_col = outcome_ID,
chr_col = outcome_CHR,
pos_col = outcome_POS)
data_outcome_proxies <- dplyr::left_join(data_outcome_proxies, proxies, by = c("SNP" = "SNP"))
data_outcome_proxies <- data_outcome_proxies %>%
dplyr::left_join(proxies, by = c("SNP" = "proxy_snp.outcome")) %>%
dplyr::mutate(proxy_snp.outcome = SNP, # add proxy_snp.outcome back in (SNP is the proxies found in the outcome)
SNP = target_snp.outcome, # make SNP the original (i.e., missing) SNP
id.outcome = unique(data_outcome[["id.outcome"]])) # reassign the id.outcome string from the original dataframe
message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))), " of ", length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) extracted"))

# select proxy-SNP(s) with the highest R2 ====
Expand Down Expand Up @@ -940,7 +943,6 @@ proxy_search_split <- function(data_exposure, data_outcome, data_outcome_path, d
return(data_outcome)
}


#' finds proxy SNPs from an internal reference (DECODE format)
#'
#' @param data_exposure Data frame containing exposure SNP data.
Expand All @@ -954,65 +956,88 @@ proxy_search_split <- function(data_exposure, data_outcome, data_outcome_path, d
#' @param outcome_sep Delimiter used in the outcome data file.
#' @return Data frame with proxy SNPs added.
#' @export
proxy_search_DECODE <- function (data_exposure, data_outcome, data_outcome_path, data_reference,
data_reference_path, tag_r2 = 0.8, tag_kb = 5000, tag_nsnp = 5000,
outcome_sep)
{
proxy_search_DECODE <- function(data_exposure, data_outcome, data_outcome_path, data_reference,
data_reference_path, tag_r2 = 0.8, tag_kb = 5000, tag_nsnp = 5000,
outcome_sep) {

# parameter Validation ====
if (!file.exists(data_outcome_path) || !file.exists(paste0(data_reference))) {
stop("Invalid file paths provided.")
}
if (!is.data.frame(data_exposure) || !is.data.frame(data_outcome)) {
stop("data_exposure and data_outcome must be data frames.")
}

# identify missing SNPs in the outcome data ====
snps_missing <- setdiff(unique(as.factor(data_exposure$SNP)),
unique(as.factor(data_outcome$SNP)))
if (length(snps_missing) == 0) {
message("No missing SNPs. Returning the original dataframe.")
return(data_outcome)
}
message(paste0("# 1. looking up ", length(unique(as.factor(snps_missing))),
" missing-SNP(s) in the reference panel"))
reference_header <- readLines(data_reference, n = 1)
column_index <- grep("rs", strsplit(reference_header, "\t")[[1]])
cmd <- sprintf("awk '$%d ~ /^(%s)$/' %s", column_index,
paste(snps_missing, collapse = "|"), paste0(data_reference))

# look-up missing SNPs in reference panel ====
message(paste0("# 1. looking up ", length(unique(as.factor(snps_missing))), " missing-SNP(s) in the reference panel"))
reference_header <- readLines(data_reference, n = 1) ## read the first row to identify the column containing "rs*"
column_index <- grep("rs", strsplit(reference_header, "\t")[[1]]) ## get column index
## bash command to filter rows based on the identified column
cmd <- sprintf("awk '$%d ~ /^(%s)$/' %s", column_index, paste(snps_missing, collapse = "|"), paste0(data_reference))
## use system to run the bash command and then read the result with fread
reference <- data.table::fread(cmd = cmd, quote = "")
snps_reference <- intersect(unique(as.factor(snps_missing)),
unique(as.factor(reference$V2)))
message(paste0("## ", length(unique(as.factor(snps_reference))),
" of ", length(unique(as.factor(snps_missing))), " missing-SNP(s) available in the reference panel"))
message(paste0("# 2. extracting proxy-SNP(s) for the ",
length(unique(as.factor(snps_reference))), " missing-SNP(s) from the reference panel"))
snps_reference <- intersect(unique(as.factor(snps_missing)), unique(as.factor(reference$V2)))
message(paste0("## ", length(unique(as.factor(snps_reference))), " of ", length(unique(as.factor(snps_missing))), " missing-SNP(s) available in the reference panel"))

# find proxies for available SNPs ====
message(paste0("# 2. extracting proxy-SNP(s) for the ", length(unique(as.factor(snps_reference))), " missing-SNP(s) from the reference panel"))
gwasvcf::set_plink(genetics.binaRies::get_plink_binary())
proxies <- functions::get_ld_proxies(rsid = snps_missing,
bfile = data_reference_path, searchspace = NULL, tag_kb = tag_kb,
tag_nsnp = tag_nsnp, tag_r2 = tag_r2, threads = 1, out = tempfile())
proxies <- proxies %>% dplyr::select(target_snp.outcome = SNP_A,
proxy_snp.outcome = SNP_B, target_a1.outcome = A1, target_a2.outcome = A2,
proxy_a1.outcome = B1, proxy_a2.outcome = B2, R) %>%
dplyr::mutate(proxy.outcome = TRUE, SNP = proxy_snp.outcome) %>%
bfile = data_reference_path,
searchspace = NULL,
tag_kb = tag_kb,
tag_nsnp = tag_nsnp,
tag_r2 = tag_r2,
threads = 1,
out = tempfile())
## format proxy data: change column order and names, add proxy.outcome = TRUE
proxies <- proxies %>%
dplyr::select(target_snp.outcome = SNP_A,
proxy_snp.outcome = SNP_B,
target_a1.outcome = A1,
target_a2.outcome = A2,
proxy_a1.outcome = B1,
proxy_a2.outcome = B2,
R) %>%
dplyr::mutate(proxy.outcome = TRUE) %>%
dplyr::select(proxy.outcome, everything())
message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)),
" missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) -
length(unique(as.factor(proxies$target_snp.outcome))),
" missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)"))
message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)), " missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) - length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)"))

# extract proxies from outcome ====
message(paste0("# 3. extracting proxy-SNP(s) from outcome"))
proxy_snps <- proxies %>% dplyr::distinct(proxy_snp.outcome) %>%
proxy_snps <- proxies %>%
dplyr::distinct(proxy_snp.outcome) %>%
dplyr::pull(proxy_snp.outcome)
data_outcome_proxies <- fread(data_outcome_path,
header = FALSE, sep = "\t",
col.names = c("chr.outcome", "pos.outcome", "SNPID", "SNP", "A1", "A2",
"beta.outcome", "pval.outcome", "min_log10_pval", "se.outcome",
"samplesize.outcome", "ImpMAF", "outcome", "effect_allele.outcome", "other_allele.outcome", "eaf.outcome"))

data_outcome_proxies <- data_outcome_proxies %>%
dplyr::filter(SNP %in% proxy_snps)
data_outcome_proxies <- dplyr::left_join(data_outcome_proxies,
proxies, by = c(SNP = "SNP"), relationship = "many-to-many")
message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))),
" of ", length(unique(as.factor(proxies$target_snp.outcome))),
" missing-SNP(s) extracted"))
data_outcome_proxies <- data_outcome_proxies %>% dplyr::group_by(target_snp.outcome) %>%
dplyr::filter(R == max(R)) %>% dplyr::slice(1) %>% dplyr::select(-R)
# dplyr::filter(SNP %in% proxy_snps) %>%
dplyr::left_join(proxies, by = c("SNP" = "proxy_snp.outcome")) %>%
dplyr::mutate(proxy_snp.outcome = SNP, # add proxy_snp.outcome back in (SNP is the proxies found in the outcome)
SNP = target_snp.outcome, # make SNP the original (i.e., missing) SNP
id.outcome = unique(data_outcome[["id.outcome"]])) # reassign the id.outcome string from the original dataframe
message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))), " of ", length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) extracted"))

# select proxy-SNP(s) with the highest R2 ====
data_outcome_proxies <- data_outcome_proxies %>%
dplyr::group_by(target_snp.outcome) %>%
dplyr::filter(R == max(R)) %>%
dplyr::slice(1) %>%
dplyr::select(-R)

# format data and bind with original outcome data ====
data_outcome_proxies <- format_data2(
data_outcome_proxies,
type = "outcome",
Expand All @@ -1033,10 +1058,13 @@ proxy_search_DECODE <- function (data_exposure, data_outcome, data_outcome_path,
min_pval = 1e-200,
log_pval = FALSE
)

data_outcome <- dplyr::bind_rows(data_outcome, data_outcome_proxies)

return(data_outcome)
}


#' finds proxy SNPs from an internal reference (UKB format)
#'
#' @param data_exposure Data frame containing exposure SNP data.
Expand All @@ -1050,50 +1078,65 @@ proxy_search_DECODE <- function (data_exposure, data_outcome, data_outcome_path,
#' @param outcome_sep Delimiter used in the outcome data file.
#' @return Data frame with proxy SNPs added.
#' @export
proxy_search_UKB <- function (data_exposure, data_outcome, data_outcome_path, data_reference,
data_reference_path, tag_r2 = 0.8, tag_kb = 5000, tag_nsnp = 5000,
outcome_sep)
{
proxy_search_UKB <- function(data_exposure, data_outcome, data_outcome_path, data_reference,
data_reference_path, tag_r2 = 0.8, tag_kb = 5000, tag_nsnp = 5000,
outcome_sep) {

# parameter Validation ====
if (!file.exists(data_outcome_path) || !file.exists(paste0(data_reference))) {
stop("Invalid file paths provided.")
}
if (!is.data.frame(data_exposure) || !is.data.frame(data_outcome)) {
stop("data_exposure and data_outcome must be data frames.")
}

# identify missing SNPs in the outcome data ====
snps_missing <- setdiff(unique(as.factor(data_exposure$SNP)),
unique(as.factor(data_outcome$SNP)))
if (length(snps_missing) == 0) {
message("No missing SNPs. Returning the original dataframe.")
return(data_outcome)
}
message(paste0("# 1. looking up ", length(unique(as.factor(snps_missing))),
" missing-SNP(s) in the reference panel"))
reference_header <- readLines(data_reference, n = 1)
column_index <- grep("rs", strsplit(reference_header, "\t")[[1]])
cmd <- sprintf("awk '$%d ~ /^(%s)$/' %s", column_index,
paste(snps_missing, collapse = "|"), paste0(data_reference))

# look-up missing SNPs in reference panel ====
message(paste0("# 1. looking up ", length(unique(as.factor(snps_missing))), " missing-SNP(s) in the reference panel"))
reference_header <- readLines(data_reference, n = 1) ## read the first row to identify the column containing "rs*"
column_index <- grep("rs", strsplit(reference_header, "\t")[[1]]) ## get column index
## bash command to filter rows based on the identified column
cmd <- sprintf("awk '$%d ~ /^(%s)$/' %s", column_index, paste(snps_missing, collapse = "|"), paste0(data_reference))
## use system to run the bash command and then read the result with fread
reference <- data.table::fread(cmd = cmd, quote = "")
snps_reference <- intersect(unique(as.factor(snps_missing)),
unique(as.factor(reference$V2)))
message(paste0("## ", length(unique(as.factor(snps_reference))),
" of ", length(unique(as.factor(snps_missing))), " missing-SNP(s) available in the reference panel"))
message(paste0("# 2. extracting proxy-SNP(s) for the ",
length(unique(as.factor(snps_reference))), " missing-SNP(s) from the reference panel"))
snps_reference <- intersect(unique(as.factor(snps_missing)), unique(as.factor(reference$V2)))
message(paste0("## ", length(unique(as.factor(snps_reference))), " of ", length(unique(as.factor(snps_missing))), " missing-SNP(s) available in the reference panel"))

# find proxies for available SNPs ====
message(paste0("# 2. extracting proxy-SNP(s) for the ", length(unique(as.factor(snps_reference))), " missing-SNP(s) from the reference panel"))
gwasvcf::set_plink(genetics.binaRies::get_plink_binary())
proxies <- functions::get_ld_proxies(rsid = snps_missing,
bfile = data_reference_path, searchspace = NULL, tag_kb = tag_kb,
tag_nsnp = tag_nsnp, tag_r2 = tag_r2, threads = 1, out = tempfile())
proxies <- proxies %>% dplyr::select(target_snp.outcome = SNP_A,
proxy_snp.outcome = SNP_B, target_a1.outcome = A1, target_a2.outcome = A2,
proxy_a1.outcome = B1, proxy_a2.outcome = B2, R) %>%
dplyr::mutate(proxy.outcome = TRUE, SNP = proxy_snp.outcome) %>%
bfile = data_reference_path,
searchspace = NULL,
tag_kb = tag_kb,
tag_nsnp = tag_nsnp,
tag_r2 = tag_r2,
threads = 1,
out = tempfile())
## format proxy data: change column order and names, add proxy.outcome = TRUE
proxies <- proxies %>%
dplyr::select(target_snp.outcome = SNP_A,
proxy_snp.outcome = SNP_B,
target_a1.outcome = A1,
target_a2.outcome = A2,
proxy_a1.outcome = B1,
proxy_a2.outcome = B2,
R) %>%
dplyr::mutate(proxy.outcome = TRUE) %>%
dplyr::select(proxy.outcome, everything())
message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)),
" missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) -
length(unique(as.factor(proxies$target_snp.outcome))),
" missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)"))
message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)), " missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) - length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)"))

# extract proxies from outcome ====
message(paste0("# 3. extracting proxy-SNP(s) from outcome"))
proxy_snps <- proxies %>% dplyr::distinct(proxy_snp.outcome) %>%
proxy_snps <- proxies %>%
dplyr::distinct(proxy_snp.outcome) %>%
dplyr::pull(proxy_snp.outcome)
data_outcome_proxies <- fread(data_outcome_path,
header = FALSE,
Expand All @@ -1102,14 +1145,22 @@ proxy_search_UKB <- function (data_exposure, data_outcome, data_outcome_path, da
dplyr::filter(SNP %in% proxy_snps)
data_outcome_proxies <- separate(data_outcome_proxies, ID, into = c("chr.outcome", "pos.outcome", "ID", "other_allele.outcome", "effect_allele.outcome", "eaf.outcome",
"INFO", "samplesize.outcome", "TEST", "beta.outcome", "se.outcome", "CHISQ", "LOG10P", "EXTRA", "outcome", "ID2"), sep = " ")
data_outcome_proxies <- dplyr::left_join(data_outcome_proxies,
proxies, by = c(SNP = "SNP"), relationship = "many-to-many")
message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))),
" of ", length(unique(as.factor(proxies$target_snp.outcome))),
" missing-SNP(s) extracted"))
data_outcome_proxies <- data_outcome_proxies %>% dplyr::group_by(target_snp.outcome) %>%
dplyr::filter(R == max(R)) %>% dplyr::slice(1) %>% dplyr::select(-R)
data_outcome_proxies <- data_outcome_proxies %>%
# dplyr::filter(SNP %in% proxy_snps) %>%
dplyr::left_join(proxies, by = c("SNP" = "proxy_snp.outcome")) %>%
dplyr::mutate(proxy_snp.outcome = SNP, # add proxy_snp.outcome back in (SNP is the proxies found in the outcome)
SNP = target_snp.outcome, # make SNP the original (i.e., missing) SNP
id.outcome = unique(data_outcome[["id.outcome"]])) # reassign the id.outcome string from the original dataframe
message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))), " of ", length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) extracted"))

# select proxy-SNP(s) with the highest R2 ====
data_outcome_proxies <- data_outcome_proxies %>%
dplyr::group_by(target_snp.outcome) %>%
dplyr::filter(R == max(R)) %>%
dplyr::slice(1) %>%
dplyr::select(-R)

# format data and bind with original outcome data ====
data_outcome_proxies <- format_data2(
data_outcome_proxies,
type = "outcome",
Expand All @@ -1132,6 +1183,7 @@ proxy_search_UKB <- function (data_exposure, data_outcome, data_outcome_path, da
)

data_outcome <- dplyr::bind_rows(data_outcome, data_outcome_proxies)

return(data_outcome)
}

Expand Down

0 comments on commit 11e6407

Please sign in to comment.