From 11e640765e84445b4a3f34b4e0df1b48388f6373 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 5 Jun 2024 12:58:27 +0200 Subject: [PATCH] fix incorrect proxy id.outcome --- R/TwoSampleMR_functions.R | 206 ++++++++++++++++++++++++-------------- 1 file changed, 129 insertions(+), 77 deletions(-) diff --git a/R/TwoSampleMR_functions.R b/R/TwoSampleMR_functions.R index 051c614..b4f170b 100644 --- a/R/TwoSampleMR_functions.R +++ b/R/TwoSampleMR_functions.R @@ -713,17 +713,17 @@ proxy_search <- function(data_exposure, data_outcome, data_outcome_path, data_re outcome_sep, outcome_phenotype, outcome_SNP, outcome_BETA, outcome_SE, outcome_P, outcome_EA, outcome_OA, outcome_EAF, outcome_N, outcome_ID, outcome_CHR, outcome_POS) { - # Parameter Validation + # parameter Validation ==== if (!file.exists(data_outcome_path) || !file.exists(paste0(data_reference))) { stop("Invalid file paths provided.") } - if (!is.data.frame(data_exposure) || !is.data.frame(data_outcome)) { stop("data_exposure and data_outcome must be data frames.") } - # exposure snps missing from outcome ==== - snps_missing <- setdiff(unique(as.factor(data_exposure$SNP)), unique(as.factor(data_outcome$SNP))) + # identify missing SNPs in the outcome data ==== + snps_missing <- setdiff(unique(as.factor(data_exposure$SNP)), + unique(as.factor(data_outcome$SNP))) if (length(snps_missing) == 0) { message("No missing SNPs. Returning the original dataframe.") return(data_outcome) @@ -760,8 +760,7 @@ proxy_search <- function(data_exposure, data_outcome, data_outcome_path, data_re proxy_a1.outcome = B1, proxy_a2.outcome = B2, R) %>% - dplyr::mutate(proxy.outcome = TRUE, - SNP = proxy_snp.outcome) %>% + dplyr::mutate(proxy.outcome = TRUE) %>% dplyr::select(proxy.outcome, everything()) message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)), " missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) - length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)")) @@ -785,7 +784,11 @@ proxy_search <- function(data_exposure, data_outcome, data_outcome_path, data_re id_col = outcome_ID, chr_col = outcome_CHR, pos_col = outcome_POS) - data_outcome_proxies <- dplyr::left_join(data_outcome_proxies, proxies, by = c("SNP" = "SNP")) + data_outcome_proxies <- data_outcome_proxies %>% + dplyr::left_join(proxies, by = c("SNP" = "proxy_snp.outcome")) %>% + dplyr::mutate(proxy_snp.outcome = SNP, # add proxy_snp.outcome back in (SNP is the proxies found in the outcome) + SNP = target_snp.outcome, # make SNP the original (i.e., missing) SNP + id.outcome = unique(data_outcome[["id.outcome"]])) # reassign the id.outcome string from the original dataframe message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))), " of ", length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) extracted")) # select proxy-SNP(s) with the highest R2 ==== @@ -940,7 +943,6 @@ proxy_search_split <- function(data_exposure, data_outcome, data_outcome_path, d return(data_outcome) } - #' finds proxy SNPs from an internal reference (DECODE format) #' #' @param data_exposure Data frame containing exposure SNP data. @@ -954,65 +956,88 @@ proxy_search_split <- function(data_exposure, data_outcome, data_outcome_path, d #' @param outcome_sep Delimiter used in the outcome data file. #' @return Data frame with proxy SNPs added. #' @export -proxy_search_DECODE <- function (data_exposure, data_outcome, data_outcome_path, data_reference, - data_reference_path, tag_r2 = 0.8, tag_kb = 5000, tag_nsnp = 5000, - outcome_sep) -{ +proxy_search_DECODE <- function(data_exposure, data_outcome, data_outcome_path, data_reference, + data_reference_path, tag_r2 = 0.8, tag_kb = 5000, tag_nsnp = 5000, + outcome_sep) { + + # parameter Validation ==== if (!file.exists(data_outcome_path) || !file.exists(paste0(data_reference))) { stop("Invalid file paths provided.") } if (!is.data.frame(data_exposure) || !is.data.frame(data_outcome)) { stop("data_exposure and data_outcome must be data frames.") } + + # identify missing SNPs in the outcome data ==== snps_missing <- setdiff(unique(as.factor(data_exposure$SNP)), unique(as.factor(data_outcome$SNP))) if (length(snps_missing) == 0) { message("No missing SNPs. Returning the original dataframe.") return(data_outcome) } - message(paste0("# 1. looking up ", length(unique(as.factor(snps_missing))), - " missing-SNP(s) in the reference panel")) - reference_header <- readLines(data_reference, n = 1) - column_index <- grep("rs", strsplit(reference_header, "\t")[[1]]) - cmd <- sprintf("awk '$%d ~ /^(%s)$/' %s", column_index, - paste(snps_missing, collapse = "|"), paste0(data_reference)) + + # look-up missing SNPs in reference panel ==== + message(paste0("# 1. looking up ", length(unique(as.factor(snps_missing))), " missing-SNP(s) in the reference panel")) + reference_header <- readLines(data_reference, n = 1) ## read the first row to identify the column containing "rs*" + column_index <- grep("rs", strsplit(reference_header, "\t")[[1]]) ## get column index + ## bash command to filter rows based on the identified column + cmd <- sprintf("awk '$%d ~ /^(%s)$/' %s", column_index, paste(snps_missing, collapse = "|"), paste0(data_reference)) + ## use system to run the bash command and then read the result with fread reference <- data.table::fread(cmd = cmd, quote = "") - snps_reference <- intersect(unique(as.factor(snps_missing)), - unique(as.factor(reference$V2))) - message(paste0("## ", length(unique(as.factor(snps_reference))), - " of ", length(unique(as.factor(snps_missing))), " missing-SNP(s) available in the reference panel")) - message(paste0("# 2. extracting proxy-SNP(s) for the ", - length(unique(as.factor(snps_reference))), " missing-SNP(s) from the reference panel")) + snps_reference <- intersect(unique(as.factor(snps_missing)), unique(as.factor(reference$V2))) + message(paste0("## ", length(unique(as.factor(snps_reference))), " of ", length(unique(as.factor(snps_missing))), " missing-SNP(s) available in the reference panel")) + + # find proxies for available SNPs ==== + message(paste0("# 2. extracting proxy-SNP(s) for the ", length(unique(as.factor(snps_reference))), " missing-SNP(s) from the reference panel")) gwasvcf::set_plink(genetics.binaRies::get_plink_binary()) proxies <- functions::get_ld_proxies(rsid = snps_missing, - bfile = data_reference_path, searchspace = NULL, tag_kb = tag_kb, - tag_nsnp = tag_nsnp, tag_r2 = tag_r2, threads = 1, out = tempfile()) - proxies <- proxies %>% dplyr::select(target_snp.outcome = SNP_A, - proxy_snp.outcome = SNP_B, target_a1.outcome = A1, target_a2.outcome = A2, - proxy_a1.outcome = B1, proxy_a2.outcome = B2, R) %>% - dplyr::mutate(proxy.outcome = TRUE, SNP = proxy_snp.outcome) %>% + bfile = data_reference_path, + searchspace = NULL, + tag_kb = tag_kb, + tag_nsnp = tag_nsnp, + tag_r2 = tag_r2, + threads = 1, + out = tempfile()) + ## format proxy data: change column order and names, add proxy.outcome = TRUE + proxies <- proxies %>% + dplyr::select(target_snp.outcome = SNP_A, + proxy_snp.outcome = SNP_B, + target_a1.outcome = A1, + target_a2.outcome = A2, + proxy_a1.outcome = B1, + proxy_a2.outcome = B2, + R) %>% + dplyr::mutate(proxy.outcome = TRUE) %>% dplyr::select(proxy.outcome, everything()) - message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)), - " missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) - - length(unique(as.factor(proxies$target_snp.outcome))), - " missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)")) + message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)), " missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) - length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)")) + + # extract proxies from outcome ==== message(paste0("# 3. extracting proxy-SNP(s) from outcome")) - proxy_snps <- proxies %>% dplyr::distinct(proxy_snp.outcome) %>% + proxy_snps <- proxies %>% + dplyr::distinct(proxy_snp.outcome) %>% dplyr::pull(proxy_snp.outcome) data_outcome_proxies <- fread(data_outcome_path, header = FALSE, sep = "\t", col.names = c("chr.outcome", "pos.outcome", "SNPID", "SNP", "A1", "A2", "beta.outcome", "pval.outcome", "min_log10_pval", "se.outcome", "samplesize.outcome", "ImpMAF", "outcome", "effect_allele.outcome", "other_allele.outcome", "eaf.outcome")) + data_outcome_proxies <- data_outcome_proxies %>% - dplyr::filter(SNP %in% proxy_snps) - data_outcome_proxies <- dplyr::left_join(data_outcome_proxies, - proxies, by = c(SNP = "SNP"), relationship = "many-to-many") - message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))), - " of ", length(unique(as.factor(proxies$target_snp.outcome))), - " missing-SNP(s) extracted")) - data_outcome_proxies <- data_outcome_proxies %>% dplyr::group_by(target_snp.outcome) %>% - dplyr::filter(R == max(R)) %>% dplyr::slice(1) %>% dplyr::select(-R) + # dplyr::filter(SNP %in% proxy_snps) %>% + dplyr::left_join(proxies, by = c("SNP" = "proxy_snp.outcome")) %>% + dplyr::mutate(proxy_snp.outcome = SNP, # add proxy_snp.outcome back in (SNP is the proxies found in the outcome) + SNP = target_snp.outcome, # make SNP the original (i.e., missing) SNP + id.outcome = unique(data_outcome[["id.outcome"]])) # reassign the id.outcome string from the original dataframe + message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))), " of ", length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) extracted")) + + # select proxy-SNP(s) with the highest R2 ==== + data_outcome_proxies <- data_outcome_proxies %>% + dplyr::group_by(target_snp.outcome) %>% + dplyr::filter(R == max(R)) %>% + dplyr::slice(1) %>% + dplyr::select(-R) + + # format data and bind with original outcome data ==== data_outcome_proxies <- format_data2( data_outcome_proxies, type = "outcome", @@ -1033,10 +1058,13 @@ proxy_search_DECODE <- function (data_exposure, data_outcome, data_outcome_path, min_pval = 1e-200, log_pval = FALSE ) + data_outcome <- dplyr::bind_rows(data_outcome, data_outcome_proxies) + return(data_outcome) } + #' finds proxy SNPs from an internal reference (UKB format) #' #' @param data_exposure Data frame containing exposure SNP data. @@ -1050,50 +1078,65 @@ proxy_search_DECODE <- function (data_exposure, data_outcome, data_outcome_path, #' @param outcome_sep Delimiter used in the outcome data file. #' @return Data frame with proxy SNPs added. #' @export -proxy_search_UKB <- function (data_exposure, data_outcome, data_outcome_path, data_reference, - data_reference_path, tag_r2 = 0.8, tag_kb = 5000, tag_nsnp = 5000, - outcome_sep) -{ +proxy_search_UKB <- function(data_exposure, data_outcome, data_outcome_path, data_reference, + data_reference_path, tag_r2 = 0.8, tag_kb = 5000, tag_nsnp = 5000, + outcome_sep) { + + # parameter Validation ==== if (!file.exists(data_outcome_path) || !file.exists(paste0(data_reference))) { stop("Invalid file paths provided.") } if (!is.data.frame(data_exposure) || !is.data.frame(data_outcome)) { stop("data_exposure and data_outcome must be data frames.") } + + # identify missing SNPs in the outcome data ==== snps_missing <- setdiff(unique(as.factor(data_exposure$SNP)), unique(as.factor(data_outcome$SNP))) if (length(snps_missing) == 0) { message("No missing SNPs. Returning the original dataframe.") return(data_outcome) } - message(paste0("# 1. looking up ", length(unique(as.factor(snps_missing))), - " missing-SNP(s) in the reference panel")) - reference_header <- readLines(data_reference, n = 1) - column_index <- grep("rs", strsplit(reference_header, "\t")[[1]]) - cmd <- sprintf("awk '$%d ~ /^(%s)$/' %s", column_index, - paste(snps_missing, collapse = "|"), paste0(data_reference)) + + # look-up missing SNPs in reference panel ==== + message(paste0("# 1. looking up ", length(unique(as.factor(snps_missing))), " missing-SNP(s) in the reference panel")) + reference_header <- readLines(data_reference, n = 1) ## read the first row to identify the column containing "rs*" + column_index <- grep("rs", strsplit(reference_header, "\t")[[1]]) ## get column index + ## bash command to filter rows based on the identified column + cmd <- sprintf("awk '$%d ~ /^(%s)$/' %s", column_index, paste(snps_missing, collapse = "|"), paste0(data_reference)) + ## use system to run the bash command and then read the result with fread reference <- data.table::fread(cmd = cmd, quote = "") - snps_reference <- intersect(unique(as.factor(snps_missing)), - unique(as.factor(reference$V2))) - message(paste0("## ", length(unique(as.factor(snps_reference))), - " of ", length(unique(as.factor(snps_missing))), " missing-SNP(s) available in the reference panel")) - message(paste0("# 2. extracting proxy-SNP(s) for the ", - length(unique(as.factor(snps_reference))), " missing-SNP(s) from the reference panel")) + snps_reference <- intersect(unique(as.factor(snps_missing)), unique(as.factor(reference$V2))) + message(paste0("## ", length(unique(as.factor(snps_reference))), " of ", length(unique(as.factor(snps_missing))), " missing-SNP(s) available in the reference panel")) + + # find proxies for available SNPs ==== + message(paste0("# 2. extracting proxy-SNP(s) for the ", length(unique(as.factor(snps_reference))), " missing-SNP(s) from the reference panel")) gwasvcf::set_plink(genetics.binaRies::get_plink_binary()) proxies <- functions::get_ld_proxies(rsid = snps_missing, - bfile = data_reference_path, searchspace = NULL, tag_kb = tag_kb, - tag_nsnp = tag_nsnp, tag_r2 = tag_r2, threads = 1, out = tempfile()) - proxies <- proxies %>% dplyr::select(target_snp.outcome = SNP_A, - proxy_snp.outcome = SNP_B, target_a1.outcome = A1, target_a2.outcome = A2, - proxy_a1.outcome = B1, proxy_a2.outcome = B2, R) %>% - dplyr::mutate(proxy.outcome = TRUE, SNP = proxy_snp.outcome) %>% + bfile = data_reference_path, + searchspace = NULL, + tag_kb = tag_kb, + tag_nsnp = tag_nsnp, + tag_r2 = tag_r2, + threads = 1, + out = tempfile()) + ## format proxy data: change column order and names, add proxy.outcome = TRUE + proxies <- proxies %>% + dplyr::select(target_snp.outcome = SNP_A, + proxy_snp.outcome = SNP_B, + target_a1.outcome = A1, + target_a2.outcome = A2, + proxy_a1.outcome = B1, + proxy_a2.outcome = B2, + R) %>% + dplyr::mutate(proxy.outcome = TRUE) %>% dplyr::select(proxy.outcome, everything()) - message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)), - " missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) - - length(unique(as.factor(proxies$target_snp.outcome))), - " missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)")) + message(paste0("## proxy-SNP(s) for ", length(unique(proxies$target_snp.outcome)), " missing-SNP(s) found; ", "proxy-SNP(s) for ", length(unique(as.factor(snps_reference))) - length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) not available (e.g., no proxy-SNP or r2 < provided)")) + + # extract proxies from outcome ==== message(paste0("# 3. extracting proxy-SNP(s) from outcome")) - proxy_snps <- proxies %>% dplyr::distinct(proxy_snp.outcome) %>% + proxy_snps <- proxies %>% + dplyr::distinct(proxy_snp.outcome) %>% dplyr::pull(proxy_snp.outcome) data_outcome_proxies <- fread(data_outcome_path, header = FALSE, @@ -1102,14 +1145,22 @@ proxy_search_UKB <- function (data_exposure, data_outcome, data_outcome_path, da dplyr::filter(SNP %in% proxy_snps) data_outcome_proxies <- separate(data_outcome_proxies, ID, into = c("chr.outcome", "pos.outcome", "ID", "other_allele.outcome", "effect_allele.outcome", "eaf.outcome", "INFO", "samplesize.outcome", "TEST", "beta.outcome", "se.outcome", "CHISQ", "LOG10P", "EXTRA", "outcome", "ID2"), sep = " ") - data_outcome_proxies <- dplyr::left_join(data_outcome_proxies, - proxies, by = c(SNP = "SNP"), relationship = "many-to-many") - message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))), - " of ", length(unique(as.factor(proxies$target_snp.outcome))), - " missing-SNP(s) extracted")) - data_outcome_proxies <- data_outcome_proxies %>% dplyr::group_by(target_snp.outcome) %>% - dplyr::filter(R == max(R)) %>% dplyr::slice(1) %>% dplyr::select(-R) + data_outcome_proxies <- data_outcome_proxies %>% + # dplyr::filter(SNP %in% proxy_snps) %>% + dplyr::left_join(proxies, by = c("SNP" = "proxy_snp.outcome")) %>% + dplyr::mutate(proxy_snp.outcome = SNP, # add proxy_snp.outcome back in (SNP is the proxies found in the outcome) + SNP = target_snp.outcome, # make SNP the original (i.e., missing) SNP + id.outcome = unique(data_outcome[["id.outcome"]])) # reassign the id.outcome string from the original dataframe + message(paste0("## proxy-SNP(s) for ", length(unique(as.factor(data_outcome_proxies$target_snp.outcome))), " of ", length(unique(as.factor(proxies$target_snp.outcome))), " missing-SNP(s) extracted")) + # select proxy-SNP(s) with the highest R2 ==== + data_outcome_proxies <- data_outcome_proxies %>% + dplyr::group_by(target_snp.outcome) %>% + dplyr::filter(R == max(R)) %>% + dplyr::slice(1) %>% + dplyr::select(-R) + + # format data and bind with original outcome data ==== data_outcome_proxies <- format_data2( data_outcome_proxies, type = "outcome", @@ -1132,6 +1183,7 @@ proxy_search_UKB <- function (data_exposure, data_outcome, data_outcome_path, da ) data_outcome <- dplyr::bind_rows(data_outcome, data_outcome_proxies) + return(data_outcome) }