From a4093efbd9622402e1f017ab1b1f61ec756f80eb Mon Sep 17 00:00:00 2001 From: Kent Riemondy Date: Wed, 11 Oct 2023 11:53:42 -0600 Subject: [PATCH] add genomecov (#411) * add genomecov functionality, closes #410 * change column output, depth -> .depth, for consistency with other valr outputs * styler::style_pkg() * fix note --- NAMESPACE | 1 + NEWS.md | 2 + R/RcppExports.R | 4 + R/bed_genomecov.R | 121 +++++++++++++++++++ R/globals.r | 2 +- bench/benchmarks.Rmd | 3 + man/bed_cluster.Rd | 1 + man/bed_complement.Rd | 1 + man/bed_flank.Rd | 1 + man/bed_genomecov.Rd | 68 +++++++++++ man/bed_merge.Rd | 1 + man/bed_partition.Rd | 1 + man/bed_shift.Rd | 1 + man/bed_slop.Rd | 1 + pkgdown/_pkgdown.yml | 1 + src/RcppExports.cpp | 12 ++ src/gcoverage.cpp | 133 +++++++++++++++++++++ src/init.c | 2 + tests/testthat/test_genomecov.R | 203 ++++++++++++++++++++++++++++++++ vignettes/valr.Rmd | 6 +- 20 files changed, 561 insertions(+), 4 deletions(-) create mode 100644 R/bed_genomecov.R create mode 100644 man/bed_genomecov.Rd create mode 100644 src/gcoverage.cpp create mode 100644 tests/testthat/test_genomecov.R diff --git a/NAMESPACE b/NAMESPACE index 6be75047..3e092839 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(bed_complement) export(bed_coverage) export(bed_fisher) export(bed_flank) +export(bed_genomecov) export(bed_glyph) export(bed_intersect) export(bed_jaccard) diff --git a/NEWS.md b/NEWS.md index b02d6ade..db013fa1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # valr (development version) +* Added `bed_genomecov()` to compute interval coverage across a genome. + # valr 0.7.0 * `read_bed` and related functions now automatically calculate the fields. Use of `n_fields` was deprecated. diff --git a/R/RcppExports.R b/R/RcppExports.R index 9e702ecb..b4d14975 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -25,6 +25,10 @@ flank_impl <- function(df, genome, both = 0, left = 0, right = 0, fraction = FAL .Call(`_valr_flank_impl`, df, genome, both, left, right, fraction, stranded, trim) } +gcoverage_impl <- function(gdf, max_coords) { + .Call(`_valr_gcoverage_impl`, gdf, max_coords) +} + intersect_impl <- function(x, y, x_grp_indexes, y_grp_indexes, invert = FALSE, suffix_x = ".x", suffix_y = ".y") { .Call(`_valr_intersect_impl`, x, y, x_grp_indexes, y_grp_indexes, invert, suffix_x, suffix_y) } diff --git a/R/bed_genomecov.R b/R/bed_genomecov.R new file mode 100644 index 00000000..dbe03b4f --- /dev/null +++ b/R/bed_genomecov.R @@ -0,0 +1,121 @@ +#' Calculate coverage across a genome +#' +#' This function is useful for calculating interval coverage across an entire genome. +#' +#' @param x [ivl_df] +#' @param genome [genome_df] +#' @param zero_depth If TRUE, report intervals with zero depth. Zero depth intervals will +#' be reported with respect to groups. +#' +#' @template groups +#' +#' @return +#' [ivl_df] with the an additional column: +#' +#' - `.depth` depth of interval coverage +#' +#' @family single set operations +#' +#' @seealso +#' \url{https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html} +#' +#' @examples +#' x <- tibble::tribble( +#' ~chrom, ~start, ~end, ~strand, +#' "chr1", 20, 70, "+", +#' "chr1", 50, 100, "-", +#' "chr1", 200, 250, "+", +#' "chr1", 220, 250, "+" +#' ) +#' +#' genome <- tibble::tribble( +#' ~chrom, ~size, +#' "chr1", 500, +#' "chr2", 1000 +#' ) +#' +#' bed_genomecov(x, genome) +#' +#' bed_genomecov(dplyr::group_by(x, strand), genome) +#' +#' bed_genomecov(dplyr::group_by(x, strand), genome, zero_depth = TRUE) +#' +#' @export +bed_genomecov <- function(x, genome, zero_depth = FALSE) { + check_required(x) + check_required(genome) + + x <- check_interval(x) + genome <- check_genome(genome) + + non_genome_chroms <- setdiff(unique(x$chrom), genome$chrom) + if (length(non_genome_chroms) > 0) { + cli::cli_warn(c( + paste0( + "The following chromosomes in bed intervals are not", + " in the genome and will be ignored:" + ), + paste0( + non_genome_chroms, + sep = "\n" + ) + )) + x <- x[!x[["chrom"]] %in% non_genome_chroms, ] + } + + grp_cols <- group_vars(x) + x <- bed_sort(x) + + groups <- rlang::syms(unique(c("chrom", grp_cols))) + x <- group_by(x, !!!groups) + + max_coords <- group_chrom_sizes(x, genome) + + res <- gcoverage_impl(x, max_coords) + res <- tibble::as_tibble(res) + + # drop non-grouped cols as values no longer match ivls + res <- select(res, chrom, start, end, one_of(grp_cols), .depth) + + if (!zero_depth) { + res <- res[res[[".depth"]] > 0, ] + } else { + # handle any missing chromosome, zero depth intervals handled on cpp side + missing_chroms <- setdiff(genome$chrom, group_data(x)$chrom) + if (length(missing_chroms) > 0) { + missing_chrom_ivls <- genome[genome[["chrom"]] %in% missing_chroms, ] + missing_chrom_ivls[["start"]] <- 0L + missing_chrom_ivls[[".depth"]] <- 0L + missing_chrom_ivls <- select(missing_chrom_ivls, chrom, start, end = size, .depth) + + if (length(groups) > 1) { + missing_chrom_ivls <- fill_missing_grouping(missing_chrom_ivls, x) + } + res <- bind_rows(res, missing_chrom_ivls) + res <- bed_sort(res) + } + } + + res +} + +# return chrom sizes for each group +group_chrom_sizes <- function(x, genome) { + xx <- left_join(group_data(x), genome, by = "chrom") + xx[["size"]] +} + +# expand df to contain non-chrom groups from grp_df +fill_missing_grouping <- function(df, grp_df) { + grps <- get_group_data(grp_df) + grps <- grps[, setdiff(colnames(grps), "chrom")] + + # expand df rows for new groups + df_grown <- df[rep(seq_len(nrow(df)), nrow(grps)), ] + # expand groups df for new groups + grp_grown <- grps[rep(seq_len(nrow(grps)), nrow(df)), ] + + stopifnot(nrow(df_grown) == nrow(grp_grown)) + res <- bind_cols(df_grown, grp_grown) + res +} diff --git a/R/globals.r b/R/globals.r index 70ca5f16..c362a231 100644 --- a/R/globals.r +++ b/R/globals.r @@ -19,5 +19,5 @@ globalVariables(c( "p.value", ".win_size", ".row_id", "cds_start", "cds_end", "score", "n_int", "sum_overlap", "sum_x", "sum_xy", "sum_y", - "temp_start", "temp_end", "seqnames" + "temp_start", "temp_end", "seqnames", ".depth" )) diff --git a/bench/benchmarks.Rmd b/bench/benchmarks.Rmd index dce85a85..7e4e32e0 100644 --- a/bench/benchmarks.Rmd +++ b/bench/benchmarks.Rmd @@ -59,6 +59,7 @@ res <- mark( bed_partition(x), bed_cluster(x), bed_complement(x, genome), + bed_genomecov(x, genome), # multi tbl functions bed_closest(x, y), bed_intersect(x, y), @@ -141,6 +142,7 @@ res2 <- mark( bed_subtract(x, y), bed_makewindows(x, win_size = 100), bed_sort(z), + bed_genomecov(x, genome), # GRanges flank(x_gr, 1000), shift(x_gr,0), @@ -151,6 +153,7 @@ res2 <- mark( setdiff(x_gr, y_gr), tile(x_gr, width = 100), sort(z_gr), + coverage(x_gr), min_time = Inf, iterations = nrep, check = FALSE, diff --git a/man/bed_cluster.Rd b/man/bed_cluster.Rd index c88708f2..53f69276 100644 --- a/man/bed_cluster.Rd +++ b/man/bed_cluster.Rd @@ -58,6 +58,7 @@ bed_glyph(bed_cluster(x), label = ".id") Other single set operations: \code{\link{bed_complement}()}, \code{\link{bed_flank}()}, +\code{\link{bed_genomecov}()}, \code{\link{bed_merge}()}, \code{\link{bed_partition}()}, \code{\link{bed_shift}()}, diff --git a/man/bed_complement.Rd b/man/bed_complement.Rd index 0430a41e..417cb526 100644 --- a/man/bed_complement.Rd +++ b/man/bed_complement.Rd @@ -55,6 +55,7 @@ bed_complement(x, genome) Other single set operations: \code{\link{bed_cluster}()}, \code{\link{bed_flank}()}, +\code{\link{bed_genomecov}()}, \code{\link{bed_merge}()}, \code{\link{bed_partition}()}, \code{\link{bed_shift}()}, diff --git a/man/bed_flank.Rd b/man/bed_flank.Rd index db58b866..75e5c91b 100644 --- a/man/bed_flank.Rd +++ b/man/bed_flank.Rd @@ -81,6 +81,7 @@ bed_flank(x, genome, both = 0.5, fraction = TRUE) Other single set operations: \code{\link{bed_cluster}()}, \code{\link{bed_complement}()}, +\code{\link{bed_genomecov}()}, \code{\link{bed_merge}()}, \code{\link{bed_partition}()}, \code{\link{bed_shift}()}, diff --git a/man/bed_genomecov.Rd b/man/bed_genomecov.Rd new file mode 100644 index 00000000..d8da91cc --- /dev/null +++ b/man/bed_genomecov.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bed_genomecov.R +\name{bed_genomecov} +\alias{bed_genomecov} +\title{Calculate coverage across a genome} +\usage{ +bed_genomecov(x, genome, zero_depth = FALSE) +} +\arguments{ +\item{x}{\link{ivl_df}} + +\item{genome}{\link{genome_df}} + +\item{zero_depth}{If TRUE, report intervals with zero depth. Zero depth intervals will +be reported with respect to groups.} +} +\value{ +\link{ivl_df} with the an additional column: +\itemize{ +\item \code{.depth} depth of interval coverage +} +} +\description{ +This function is useful for calculating interval coverage across an entire genome. +} +\details{ +input tbls are grouped by \code{chrom} by default, and additional +groups can be added using \code{\link[dplyr:group_by]{dplyr::group_by()}}. For example, +grouping by \code{strand} will constrain analyses to the same strand. To +compare opposing strands across two tbls, strands on the \code{y} tbl can +first be inverted using \code{\link[=flip_strands]{flip_strands()}}. +} +\examples{ +x <- tibble::tribble( + ~chrom, ~start, ~end, ~strand, + "chr1", 20, 70, "+", + "chr1", 50, 100, "-", + "chr1", 200, 250, "+", + "chr1", 220, 250, "+" +) + +genome <- tibble::tribble( + ~chrom, ~size, + "chr1", 500, + "chr2", 1000 +) + +bed_genomecov(x, genome) + +bed_genomecov(dplyr::group_by(x, strand), genome) + +bed_genomecov(dplyr::group_by(x, strand), genome, zero_depth = TRUE) + + +} +\seealso{ +\url{https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html} + +Other single set operations: +\code{\link{bed_cluster}()}, +\code{\link{bed_complement}()}, +\code{\link{bed_flank}()}, +\code{\link{bed_merge}()}, +\code{\link{bed_partition}()}, +\code{\link{bed_shift}()}, +\code{\link{bed_slop}()} +} +\concept{single set operations} diff --git a/man/bed_merge.Rd b/man/bed_merge.Rd index 0220acd6..a93ded13 100644 --- a/man/bed_merge.Rd +++ b/man/bed_merge.Rd @@ -66,6 +66,7 @@ Other single set operations: \code{\link{bed_cluster}()}, \code{\link{bed_complement}()}, \code{\link{bed_flank}()}, +\code{\link{bed_genomecov}()}, \code{\link{bed_partition}()}, \code{\link{bed_shift}()}, \code{\link{bed_slop}()} diff --git a/man/bed_partition.Rd b/man/bed_partition.Rd index a155a3b6..ac1d0687 100644 --- a/man/bed_partition.Rd +++ b/man/bed_partition.Rd @@ -75,6 +75,7 @@ Other single set operations: \code{\link{bed_cluster}()}, \code{\link{bed_complement}()}, \code{\link{bed_flank}()}, +\code{\link{bed_genomecov}()}, \code{\link{bed_merge}()}, \code{\link{bed_shift}()}, \code{\link{bed_slop}()} diff --git a/man/bed_shift.Rd b/man/bed_shift.Rd index 9b4d69d5..84996705 100644 --- a/man/bed_shift.Rd +++ b/man/bed_shift.Rd @@ -70,6 +70,7 @@ Other single set operations: \code{\link{bed_cluster}()}, \code{\link{bed_complement}()}, \code{\link{bed_flank}()}, +\code{\link{bed_genomecov}()}, \code{\link{bed_merge}()}, \code{\link{bed_partition}()}, \code{\link{bed_slop}()} diff --git a/man/bed_slop.Rd b/man/bed_slop.Rd index 4d1ed7aa..fc2dfb63 100644 --- a/man/bed_slop.Rd +++ b/man/bed_slop.Rd @@ -82,6 +82,7 @@ Other single set operations: \code{\link{bed_cluster}()}, \code{\link{bed_complement}()}, \code{\link{bed_flank}()}, +\code{\link{bed_genomecov}()}, \code{\link{bed_merge}()}, \code{\link{bed_partition}()}, \code{\link{bed_shift}()} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 881f9ed8..4e0462ba 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -30,6 +30,7 @@ reference: - bed_cluster - bed_complement - bed_flank + - bed_genomecov - bed_merge - bed_partition - bed_slop diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 90f2c090..6122e492 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -97,6 +97,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// gcoverage_impl +DataFrame gcoverage_impl(const ValrGroupedDataFrame& gdf, const IntegerVector& max_coords); +RcppExport SEXP _valr_gcoverage_impl(SEXP gdfSEXP, SEXP max_coordsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const ValrGroupedDataFrame& >::type gdf(gdfSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type max_coords(max_coordsSEXP); + rcpp_result_gen = Rcpp::wrap(gcoverage_impl(gdf, max_coords)); + return rcpp_result_gen; +END_RCPP +} // intersect_impl DataFrame intersect_impl(ValrGroupedDataFrame x, ValrGroupedDataFrame y, IntegerVector x_grp_indexes, IntegerVector y_grp_indexes, bool invert, const std::string& suffix_x, const std::string& suffix_y); RcppExport SEXP _valr_intersect_impl(SEXP xSEXP, SEXP ySEXP, SEXP x_grp_indexesSEXP, SEXP y_grp_indexesSEXP, SEXP invertSEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP) { diff --git a/src/gcoverage.cpp b/src/gcoverage.cpp new file mode 100644 index 00000000..24c03267 --- /dev/null +++ b/src/gcoverage.cpp @@ -0,0 +1,133 @@ +// gcoverage.cpp +// +// Copyright (C) 2023 Jay Hesselberth and Kent Riemondy +// +// This file is part of valr. +// +// This software may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. + +#include "valr.h" + +typedef std::pair posTrack; +typedef std::vector posTracker; + +// sort starts and ends while encoding coverage value +// to sum for overlapping intervals +posTracker collatePositions(const IntegerVector& starts, + const IntegerVector& ends) { + + posTracker p; + auto n = starts.size() ; + + if (n != ends.size()) { + stop("incompatible start and end vector supplied") ; + } + + for (int i = 0; i < n; i++) { + auto s = starts[i]; + auto e = ends[i]; + p.push_back(posTrack(s, 1)); + p.push_back(posTrack(e, -1)); + } + + std::sort(p.begin(), p.end()) ; + + return p; +} + +//[[Rcpp::export]] +DataFrame gcoverage_impl(const ValrGroupedDataFrame& gdf, + const IntegerVector& max_coords) { + + auto ng = gdf.ngroups() ; + DataFrame df = gdf.data() ; + + ListView idx(gdf.indices()) ; + + if(max_coords.size() != ng) { + stop("max_coords must equal the number of groups in data.frame"); + } + + std::vector out_indices, depths, starts, ends; + + for (int i = 0; i < ng; i++) { + + auto max_coord = max_coords[i]; + + IntegerVector indices = idx[i]; + indices = indices - 1; + + IntegerVector qstarts = df["start"] ; + IntegerVector qends = df["end"] ; + qstarts = qstarts[indices]; + qends = qends[indices]; + + // track single index of a representative interval + // this will be used to recover grouped column values + int grp_idx = indices[0]; + + auto pos = collatePositions(qstarts, qends); + + int prev_pos = 0; + int cvg = 0; + + for (auto p:pos) { + if (p.first > max_coord) { + warning("Out of bounds interval detected at position: %s \n" + " Out of bounds intervals will be ignored", + p.first); + break; + } + + if (p.first == prev_pos) { + cvg += p.second; + continue; + } + depths.push_back(cvg); + starts.push_back(prev_pos); + ends.push_back(p.first); + out_indices.push_back(grp_idx); + + prev_pos = p.first; + cvg += p.second; + } + + if(ends.back() < max_coord) { + depths.push_back(0); + starts.push_back(ends.back()); + ends.push_back(max_coord); + out_indices.push_back(grp_idx); + } + + } + + // subset original dataframe, note that the grouped column values will be correct + // but non-grouped column values are no longer matched to each interval and + // must be dropped on the R side + DataFrame subset_x = subset_dataframe(df, out_indices) ; + + subset_x["start"] = starts ; + subset_x["end"] = ends ; + subset_x[".depth"] = depths ; + + return subset_x ; +} + +/*** R +library(dplyr) +x <- tibble::tribble( + ~chrom, ~start, ~end, ~name, ~score, ~strand, + "chr1", 20, 70, 6, 25, "+", + "chr1", 50, 100, 1, 25, "-", + "chr1", 200, 250, 3, 25, "+", + "chr1", 220, 250, 3, 25, "+", + "chr2", 80, 130, 5, 25, "-", + "chr2", 150, 200, 4, 25, "+", + "chr2", 180, 230, 2, 25, "-", + "chr2", 190, 230, 2, 25, "-" +) |> group_by(chrom) + +gcoverage_impl(x, max_coords = c(1000, 500)) |> as.data.frame() + +*/ diff --git a/src/init.c b/src/init.c index 77796f5f..11fd9038 100644 --- a/src/init.c +++ b/src/init.c @@ -14,6 +14,7 @@ extern SEXP _valr_complement_impl(void *, void *); extern SEXP _valr_coverage_impl(void *, void *, void *, void *); extern SEXP _valr_dist_impl(void *, void *, void *, void *, void *); extern SEXP _valr_flank_impl(void *, void *, void *, void *, void *, void *, void *, void *); +extern SEXP _valr_gcoverage_impl(void *, void *); extern SEXP _valr_intersect_impl(void *, void *, void *, void *, void *, void *, void *); extern SEXP _valr_makewindows_impl(void *, void *, void *, void *, void *); extern SEXP _valr_merge_impl(void *, void *, void *); @@ -29,6 +30,7 @@ static const R_CallMethodDef CallEntries[] = { {"_valr_coverage_impl", (DL_FUNC) &_valr_coverage_impl, 4}, {"_valr_dist_impl", (DL_FUNC) &_valr_dist_impl, 5}, {"_valr_flank_impl", (DL_FUNC) &_valr_flank_impl, 8}, + {"_valr_gcoverage_impl", (DL_FUNC) &_valr_gcoverage_impl, 2}, {"_valr_intersect_impl", (DL_FUNC) &_valr_intersect_impl, 7}, {"_valr_makewindows_impl", (DL_FUNC) &_valr_makewindows_impl, 5}, {"_valr_merge_impl", (DL_FUNC) &_valr_merge_impl, 3}, diff --git a/tests/testthat/test_genomecov.R b/tests/testthat/test_genomecov.R new file mode 100644 index 00000000..19bf204e --- /dev/null +++ b/tests/testthat/test_genomecov.R @@ -0,0 +1,203 @@ +x <- tibble::tribble( + ~chrom, ~start, ~end, ~strand, + "chr1", 20, 70, "+", + "chr1", 50, 100, "-", + "chr1", 200, 250, "+", + "chr1", 220, 250, "+" +) + +genome <- tibble::tribble( + ~chrom, ~size, + "chr1", 500, + "chr2", 1000 +) +test_that("bed_genomecov works", { + ex <- tibble::tribble( + ~chrom, ~start, ~end, ~.depth, + "chr1", 20L, 50L, 1L, + "chr1", 50L, 70L, 2L, + "chr1", 70L, 100L, 1L, + "chr1", 200L, 220L, 1L, + "chr1", 220L, 250L, 2L + ) + obs <- bed_genomecov(x, genome) + expect_identical(obs, ex) + + res <- bed_genomecov(x, genome, zero_depth = TRUE) + expect_true(sum(res$.depth == 0) > 0) +}) + +test_that("groups are respected", { + ex <- tibble::tribble( + ~chrom, ~start, ~end, ~strand, ~.depth, + "chr1", 20L, 70L, "+", 1L, + "chr1", 200L, 220L, "+", 1L, + "chr1", 220L, 250L, "+", 2L, + "chr1", 50L, 100L, "-", 1L + ) + obs <- bed_genomecov(group_by(x, strand), genome) + expect_identical(obs, ex) + + res <- bed_genomecov(group_by(x, strand), genome, zero_depth = TRUE) + expect_equal(sum(res$chrom == "chr2"), 2L) +}) + +test_that("grouping is retained for zero depth intervals", { + xx <- tibble::tribble( + ~chrom, ~start, ~end, ~strand, ~grp, + "chr1", 20L, 70L, "+", 1, + "chr1", 200L, 220L, "-", 1, + "chr1", 20L, 70L, "+", 2, + "chr1", 200L, 220L, "-", 2 + ) |> + group_by(strand, grp) + + many_chroms_genome <- tibble( + chrom = c("chr1", LETTERS), + size = 500 + ) + + res <- bed_genomecov(xx, many_chroms_genome, zero_depth = TRUE) + expect_equal(length(setdiff(many_chroms_genome$chrom, res$chrom)), 0L) + + ex <- tibble::tribble( + ~strand, ~grp, ~n, + "+", 1, 26L, + "+", 2, 26L, + "-", 1, 26L, + "-", 2, 26L + ) + + lr <- res[res$chrom %in% LETTERS, ] + nlr <- lr |> + group_by(strand, grp) |> + summarize(n = n()) |> + ungroup() + expect_identical(nlr, ex) +}) + +test_that("chroms in bed, not in genome, are dropped", { + xx <- tibble::tribble( + ~chrom, ~start, ~end, ~strand, ~.depth, + "hello", 20L, 70L, "+", 1L, + "world", 200L, 220L, "+", 1L, + "chr1", 220L, 250L, "+", 2L + ) + expect_warning(res <- bed_genomecov(xx, genome)) + expect_true(all(res$chrom == "chr1")) +}) + +test_that("zero length input is handled", { + xx <- tibble::tribble( + ~chrom, ~start, ~end, ~strand, ~.depth, + "hello", 20L, 70L, "+", 1L, + ) + + expect_warning(res <- bed_genomecov(xx, genome)) + expect_true(nrow(res) == 0) + + xx <- xx[xx$chrom != "hello", ] + + res <- bed_genomecov(xx, genome) + expect_true(nrow(res) == 0) +}) + +test_that("check edge cases with 1 bp intervals", { + # base-level coverage equals number of basepairs in input intervals + genome <- tribble( + ~chrom, ~size, + "chr1", 1e5 + ) + seed <- 1010486 + ivls <- bed_random(genome, length = 1, n = 1e3, seed = seed) + + res <- bed_genomecov(ivls, genome) + expect_true(sum(res$.depth) == 1e3) + + ivls <- tibble( + chrom = "chr1", + start = seq(0, 999), + end = start + 2 + ) + res <- bed_genomecov(ivls, genome) + expect_true(sum(res$.depth) == (1e3 * 2)) + + set.seed(seed) + ivls <- tibble( + chrom = "chr1", + start = seq(0, 999), + end = start + sample(1:100, length(start), replace = TRUE) + ) + + res <- bed_genomecov(ivls, genome) + n_bp <- sum(ivls$end - ivls$start) + n_cov <- sum(res$.depth * (res$end - res$start)) + expect_equal(n_bp, n_cov) +}) + +test_that("check edge cases at beginning and end", { + genome <- tribble( + ~chrom, ~size, + "chr1", 1000 + ) + ex <- tibble::tribble( + ~chrom, ~start, ~end, ~.depth, + "chr1", 0L, 1L, 3L, + "chr1", 1L, 2L, 1L, + "chr1", 999L, 1000L, 1L + ) + + # oob intervals are ignored with a warning + ivls <- tibble( + chrom = "chr1", + start = c(rep(0, 3), 1, 999, 1000), + end = start + 1 + ) + + expect_warning(res <- bed_genomecov(ivls, genome)) + expect_true(all(res$start < 1000)) + + expect_identical(res, ex) +}) + +# bed related tests from #https://github.com/arq5x/bedtools2/blob/master/test/genomecov/test-genomecov.sh + +y <- tibble::tribble( + ~chrom, ~start, ~end, ~group, ~score, ~strand, + "1", 15L, 20L, "y1", 1L, "+", + "1", 17L, 22L, "y2", 2L, "+" +) + +genome <- tibble::tribble( + ~chrom, ~size, + "1", 100L, + "2", 100L, + "3", 100L +) + +test_that("Test with chroms that have no coverage", { + ex <- tibble::tribble( + ~chrom, ~start, ~end, ~.depth, + "1", 15L, 17L, 1L, + "1", 17L, 20L, 2L, + "1", 20L, 22L, 1L + ) + obs <- bed_genomecov(y, genome) + expect_equal(ex, obs) +}) + +test_that("Test with chroms that have no coverage", { + ex <- tibble::tribble( + ~chrom, ~start, ~end, ~.depth, + "1", 0L, 15L, 0L, + "1", 15L, 17L, 1L, + "1", 17L, 20L, 2L, + "1", 20L, 22L, 1L, + "1", 22L, 100L, 0L, + "2", 0L, 100L, 0L, + "3", 0L, 100L, 0L + ) + + obs <- bed_genomecov(y, genome, zero_depth = TRUE) + expect_equal(ex, obs) +}) diff --git a/vignettes/valr.Rmd b/vignettes/valr.Rmd index 88108ea4..ec88e4e0 100644 --- a/vignettes/valr.Rmd +++ b/vignettes/valr.Rmd @@ -295,14 +295,14 @@ ggplot( aes( x = .win_id, y = win_mean - ) - ) + + ) +) + geom_point() + geom_pointrange(sd_limits) + scale_x_continuous( labels = x_labels, breaks = x_breaks - ) + + ) + labs( x = "Position (bp from TSS)", y = "Signal",