From 7a0f6032f2be3d79b87e13e4db6e86fd2bcd5637 Mon Sep 17 00:00:00 2001 From: cag134392 <49640128+cag134392@users.noreply.github.com> Date: Tue, 28 Feb 2023 10:38:11 +0100 Subject: [PATCH] Update prepareLinksDatabase.R Update for R.4.1.0 > functionality --- R/prepareLinksDatabase.R | 152 +++++++++++++++++++++++++-------------- 1 file changed, 99 insertions(+), 53 deletions(-) diff --git a/R/prepareLinksDatabase.R b/R/prepareLinksDatabase.R index fc0b9d4..e89ab5b 100644 --- a/R/prepareLinksDatabase.R +++ b/R/prepareLinksDatabase.R @@ -20,99 +20,129 @@ prepareLinksDatabase <- function(annotation, tss.window, tes.window) { txdb <- GenomicFeatures::makeTxDbFromGRanges(annotation) ebt <- GenomicFeatures::exonsBy(txdb, by = "tx", use.names = TRUE) t2g <- AnnotationDbi::select(txdb, - keys = names(ebt), - keytype = "TXNAME", - columns = "GENEID") + keys = names(ebt), + keytype = "TXNAME", + columns = "GENEID" + ) e2 <- BiocGenerics::unlist(ebt) e2$transcript_id <- names(e2) - e2$gene_id = t2g$GENEID[match(e2$transcript_id, t2g$TXNAME)] + e2$gene_id <- t2g$GENEID[match(e2$transcript_id, t2g$TXNAME)] e2$exon_id <- e2$exon_name e2$exon_name <- NULL e2$type <- "exon" names(e2) <- NULL - mcols(e2) <- mcols(e2)[, c("exon_id", "exon_rank", - "transcript_id", "gene_id", "type")] + mcols(e2) <- mcols(e2)[, c( + "exon_id", "exon_rank", + "transcript_id", "gene_id", "type" + )] bins <- list() # TSS data base # take first position per transcript and make it single nt - tss.bins <- - strandSort(plyranges::mutate(plyranges::anchor_5p(dplyr::filter(e2, exon_rank == - 1)), width = 1)) + tss.bins <- + strandSort( + plyranges::mutate( + plyranges::anchor_5p( + dplyr::filter(e2, exon_rank == 1)), width = 1)) # make unique TSS starts merging in a 50nt window. - cat("Preparing TSSBase\n") + message("Preparing TSSBase") tss.base <- strandSort( GenomicRanges::makeGRangesFromDataFrame( reshape::melt(GenomicRanges::reduce( GenomicRanges::split(tss.bins, ~gene_id), - min.gapwidth = tss.window)), + min.gapwidth = tss.window + )), keep.extra.columns = TRUE ) ) bins$tss.bins <- tss.bins bins$tss.base <- tss.base tss.base <- - tibble::as_tibble(tss.base) %>% dplyr::group_by(value.group_name) %>% - dplyr::mutate(count = paste0(value.group_name, ":P", sprintf("%02d", sequence(dplyr::n( - ))))) %>% + tibble::as_tibble(tss.base) %>% + dplyr::group_by(value.group_name) %>% + dplyr::mutate( + count = paste0(value.group_name, ":P", + sprintf("%02d", sequence(dplyr::n())))) %>% GenomicRanges::makeGRangesFromDataFrame(., keep.extra.columns = TRUE) # annotate isoforms with promoter_id ii <- GenomicRanges::findOverlaps(tss.bins, tss.base, maxgap = tss.window - 1) tss.bins.annot <- - GenomicRanges::makeGRangesFromDataFrame(rbind(data.frame(tss.bins[S4Vectors::queryHits(ii)], - tss.base[S4Vectors::subjectHits(ii)])), keep.extra.columns = TRUE) + GenomicRanges::makeGRangesFromDataFrame(rbind(data.frame( + tss.bins[S4Vectors::queryHits(ii)], + tss.base[S4Vectors::subjectHits(ii)] + )), keep.extra.columns = TRUE) tss.bins.annot <- - tss.bins.annot %>% dplyr::mutate(transcript_id = transcript_id, promoter_id = - count) %>% + tss.bins.annot %>% + dplyr::mutate( + transcript_id = transcript_id, + promoter_id = count + ) %>% plyranges::select(gene_id, transcript_id, promoter_id) # TES data base # last exon per transcript le <- GenomicRanges::makeGRangesFromDataFrame( - e2 %>% group_by(transcript_id) %>% dplyr::filter(exon_rank == max(exon_rank)), + e2 %>% + as.data.frame(.) %>% + group_by(transcript_id) %>% + dplyr::filter(exon_rank == max(exon_rank)), keep.extra.columns = TRUE ) - tes.bins <- - strandSort(plyranges::mutate(plyranges::anchor_3p(le), width = 1)) - # make unique TSS starts merging in a 50nt window. - cat("PrepEndBase") + tes.bins <- strandSort( + plyranges::mutate(plyranges::anchor_3p(le), width = 1)) + # make unique TES starts merging in a 50nt window. + message("Preparing TES database") tes.base <- - strandSort ( + strandSort( GenomicRanges::makeGRangesFromDataFrame( - reshape::melt(GenomicRanges::reduce(( - GenomicRanges::split(tes.bins, ~ gene_id) - ), - min.gapwidth = - tes.window)), + reshape::melt(GenomicRanges::reduce( + ( + GenomicRanges::split(tes.bins, ~gene_id) + ), + min.gapwidth = + tes.window + )), keep.extra.columns = TRUE ) ) tes.base <- - tibble::as.tibble(tes.base) %>% dplyr::group_by(value.group_name) %>% - dplyr::mutate(count = paste0(value.group_name, ":TE", sprintf("%02d", sequence(dplyr::n( - ))))) %>% + tibble::as.tibble(tes.base) %>% + dplyr::group_by(value.group_name) %>% + dplyr::mutate( + count = paste0(value.group_name, ":TE", + sprintf("%02d", sequence(dplyr::n())))) %>% GenomicRanges::makeGRangesFromDataFrame(., keep.extra.columns = TRUE) # assign tes_ids to isoforms + message("Assign TES to isoforms") ii <- findOverlaps(tes.bins, tes.base, maxgap = tes.window - 1) tes.bins.annot <- - GenomicRanges::makeGRangesFromDataFrame(rbind(data.frame(tes.bins[S4Vectors::queryHits(ii)], - tes.base[S4Vectors::subjectHits(ii)])), keep.extra.columns = TRUE) + GenomicRanges::makeGRangesFromDataFrame(rbind(data.frame( + tes.bins[S4Vectors::queryHits(ii)], + tes.base[S4Vectors::subjectHits(ii)] + )), keep.extra.columns = TRUE) tes.bins.annot <- - tes.bins.annot %>% dplyr::mutate(transcript_id = transcript_id, tes_id =count) %>% + tes.bins.annot %>% + dplyr::mutate(transcript_id = transcript_id, + tes_id = count) %>% plyranges::select(gene_id, transcript_id, tes_id) bins$tes.bins <- tes.bins bins$tes.base <- tes.base # create link database - linksDbs <- dplyr::left_join(as.data.frame(tes.bins.annot), - as.data.frame(tss.bins.annot), - by = "transcript_id") %>% - dplyr::select(gene_id.x, transcript_id, promoter_id, tes_id) %>% dplyr::rename(gene_id = gene_id.x) %>% - dplyr::mutate(pairs_id = paste(promoter_id, gsub(".*:", "", tes_id), sep =":")) + message("Create TSS/TES links database") + linksDbs <- dplyr::left_join( + as.data.frame(tes.bins.annot), + as.data.frame(tss.bins.annot), + by = "transcript_id" + ) %>% + dplyr::select(gene_id.x, transcript_id, promoter_id, tes_id) %>% + dplyr::rename(gene_id = gene_id.x) %>% + dplyr::mutate(pairs_id = paste(promoter_id, gsub(".*:", "", tes_id), sep = ":")) # Classify promoter type and utr type # sorted with proximal first distal later # output: class all 3'end isoforms le.sort <- strandSort(le) - tt <- linksDbs %>% dplyr::group_by(gene_id) %>% + tt <- linksDbs %>% + dplyr::group_by(gene_id) %>% dplyr::mutate( utr_type = dplyr::case_when( tes_id == min(tes_id) ~ "proximal", @@ -120,7 +150,8 @@ prepareLinksDatabase <- function(annotation, tss.window, tes.window) { TRUE ~ "other" ) ) - tt <- tt %>% group_by(gene_id) %>% + tt <- tt %>% + group_by(gene_id) %>% dplyr::mutate( promoter_type = case_when( promoter_id == min(promoter_id) ~ "distal", @@ -130,10 +161,17 @@ prepareLinksDatabase <- function(annotation, tss.window, tes.window) { ) # classify APA-ATSS genes # genes with more than 1 promoter different promoter - atss.gene <- tt %>% dplyr::distinct(promoter_id, .keep_all = TRUE) %>% - dplyr::group_by(gene_id) %>% dplyr::filter(dplyr::n() > 1) %>% dplyr::pull(gene_id) - apa.gene <- tt %>% dplyr::distinct(tes_id, .keep_all = TRUE) %>% - dplyr::group_by(gene_id) %>% dplyr::filter(dplyr::n() > 1) %>% dplyr::pull(gene_id) + message("Classify genes") + atss.gene <- tt %>% + dplyr::distinct(promoter_id, .keep_all = TRUE) %>% + dplyr::group_by(gene_id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::pull(gene_id) + apa.gene <- tt %>% + dplyr::distinct(tes_id, .keep_all = TRUE) %>% + dplyr::group_by(gene_id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::pull(gene_id) tt <- tt %>% dplyr::mutate( tss.status = ifelse(gene_id %in% atss.gene, "ATSS", "single_promoter"), @@ -147,7 +185,8 @@ prepareLinksDatabase <- function(annotation, tss.window, tes.window) { tt <- subset(tt, gene_id == promoter_gene) tt$promoter_gene <- NULL bins$links <- tt - tt <- tt %>% dplyr::group_by(gene_id) %>% + tt <- tt %>% + dplyr::group_by(gene_id) %>% dplyr::mutate( utr_type = dplyr::case_when( tes_id == min(tes_id) ~ "proximal", @@ -155,7 +194,8 @@ prepareLinksDatabase <- function(annotation, tss.window, tes.window) { TRUE ~ "other" ) ) - tt <- tt %>% group_by(gene_id) %>% + tt <- tt %>% + group_by(gene_id) %>% dplyr::mutate( promoter_type = case_when( promoter_id == min(promoter_id) ~ "distal", @@ -163,10 +203,16 @@ prepareLinksDatabase <- function(annotation, tss.window, tes.window) { TRUE ~ "intermediate" ) ) - atss.gene <- tt %>% dplyr::distinct(promoter_id, .keep_all = TRUE) %>% - dplyr::group_by(gene_id) %>% dplyr::filter(dplyr::n() > 1) %>% dplyr::pull(gene_id) - apa.gene <- tt %>% dplyr::distinct(tes_id, .keep_all = TRUE) %>% - dplyr::group_by(gene_id) %>% dplyr::filter(dplyr::n() > 1) %>% dplyr::pull(gene_id) + atss.gene <- tt %>% + dplyr::distinct(promoter_id, .keep_all = TRUE) %>% + dplyr::group_by(gene_id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::pull(gene_id) + apa.gene <- tt %>% + dplyr::distinct(tes_id, .keep_all = TRUE) %>% + dplyr::group_by(gene_id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::pull(gene_id) tt <- tt %>% dplyr::mutate( tss.status = ifelse(gene_id %in% atss.gene, "ATSS", "single_promoter"), @@ -178,6 +224,6 @@ prepareLinksDatabase <- function(annotation, tss.window, tes.window) { result$TESCoordinate.base <- tes.base result$TSSCoordinate.bins <- tss.bins result$TSSCoordinate.base <- tss.base + message("Database Sucessfully created") return(result) } -