From f40defe7504aad5e269df5879a52587d8bc13751 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Tue, 23 Jul 2024 08:56:54 -0400 Subject: [PATCH 01/27] feat: add testthat GHA workflow --- .github/workflows/tests.yaml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/tests.yaml diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 0000000..bb664b1 --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,26 @@ +on: + push: + branches: [main, master, dev] + pull_request: + branches: [main, master, dev] + +name: Run tests + +jobs: + test: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v3 + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::testthat + needs: tests + - name: Run tests + run: | + testthat::test_dir("tests/testthat") + shell: Rscript {0} From e0d8e3ba6ea021940cc17368947b1304ab9dce62 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Tue, 23 Jul 2024 08:57:09 -0400 Subject: [PATCH 02/27] docs: add `cli` to imports --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index c3f11d7..e66a524 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,8 @@ Imports: data.table (>= 1.12.2), glue, bit64, - crayon + crayon, + cli Suggests: knitr, rmarkdown, From e6b1c2632b032aca0f693a644acc91160e7f1d34 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Tue, 23 Jul 2024 08:58:26 -0400 Subject: [PATCH 03/27] feat: add `dbMatrix_from_tbl` --- TODO: - refactor `show` method --- R/dbMatrix.R | 318 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 253 insertions(+), 65 deletions(-) diff --git a/R/dbMatrix.R b/R/dbMatrix.R index 99c2612..4eadb1b 100644 --- a/R/dbMatrix.R +++ b/R/dbMatrix.R @@ -80,20 +80,39 @@ setMethod('show', signature(object = 'dbDenseMatrix'), function(object) { # get matrix i and j to print # p_coln = head(col_names, 10L) - p_coln = c(1:3, (dim_col-2):dim_col) + if(dim_col > 6) { + p_coln = c(1:3, (dim_col-2):dim_col) + } else { + p_coln = col_names + } + if(dim_row - 6L > 0L) { # p_rown = c(head(row_names, 3L), tail(row_names, 3L)) p_rown = c(1:3, (dim_row-2):dim_row) } else { - # p_rown = row_names - p_rown = 1:length(row_names) + p_rown = row_names + # p_rown = 1:length(row_names) } + filter_i = sapply(p_rown, function(f_i) which(f_i == row_names)) + filter_j = sapply(p_coln, function(f_j) which(f_j == col_names)) + # prepare subset to print preview_dt = object@value |> - dplyr::filter(i %in% p_rown & j %in% p_coln) |> - data.table::as.data.table() - data.table::setkeyv(preview_dt, c('i', 'j')) # enforce ordering + dplyr::filter(i %in% filter_i & j %in% filter_j) |> + dplyr::collect() + + # ij indices for printing + a_i = sapply(preview_dt$i, function(i_idx) which(row_names[i_idx] == p_rown)) + a_j = sapply(preview_dt$j, function(j_idx) which(col_names[j_idx] == p_coln)) + + if (length(a_i) == 0L) a_i = NULL + if (length(a_j) == 0L) a_j = NULL + + a_x <- NULL + if (length(preview_dt$x) != 0L) { # catch sparse case where if/else: null + a_x <- preview_dt$x + } if(nrow(preview_dt) > 0) { preview_dt = data.table::dcast(preview_dt, formula = i ~ j, value.var = 'x') @@ -101,7 +120,8 @@ setMethod('show', signature(object = 'dbDenseMatrix'), function(object) { print("") # TODO update this for sparse matrix } - if(nrow(preview_dt < 7L)) { + suppress_rows = FALSE + if(suppress_rows) { output = as.matrix(preview_dt[1:6,2:7]) rownames(output) = as.matrix(preview_dt[,1]) @@ -160,13 +180,13 @@ setMethod('show', signature(object = 'dbDenseMatrix'), function(object) { sep = " ", file = "") } else { - # data.table::setkey(preview_dt, NULL) - print(preview_dt[1L:3L,], digits = 5L, row.names = 'none') - - sprintf(' ........suppressing %d columns and %d rows\n', - dim_col - 10L, dim_row - 6L) - - print(preview_dt[4L:6L,], digits = 5L, row.names = 'none') + print_array( + i = a_i, + j = a_j, + x = a_x, + dims = c(length(p_rown), length(p_coln)), + rownames = p_rown + ) } cat('\n') @@ -264,7 +284,13 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { writeLines(a_out[5:7]) } else { # no suppressed lines: Directly print - print_array(i = a_i, j = a_j, x = a_x, dims = c(length(p_rown), length(p_coln)), rownames = p_rown) + print_array( + i = a_i, + j = a_j, + x = a_x, + dims = c(length(p_rown), length(p_coln)), + rownames = p_rown + ) } @@ -292,7 +318,7 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { #' @description #' Create an S4 \code{dbMatrix} object in sparse or dense triplet vector format. #' @param value data to be added to the database. See details for supported data types \code{(required)} -#' @param name table name to assign within database \code{(optional, default: "dbMatrix")} +#' @param name table name to assign within database \code{(required, default: "dbMatrix")} #' @param con DBI or duckdb connection object \code{(required)} #' @param overwrite whether to overwrite if table already exists in database \code{(required)} #' @param class class of the dbMatrix: \code{dbDenseMatrix} or \code{dbSparseMatrix} \code{(required)} @@ -329,17 +355,17 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { #' overwrite = TRUE) #' dbSparse dbMatrix <- function(value, - class = NULL, - con = NULL, - overwrite = FALSE, - name = "dbMatrix", - dims = NULL, - dim_names = NULL, - mtx_rowname_file_path, - mtx_rowname_col_idx = 1, - mtx_colname_file_path, - mtx_colname_col_idx = 1, - ...) { + class = NULL, + con = NULL, + overwrite = FALSE, + name = "dbMatrix", + dims = NULL, + dim_names = NULL, + mtx_rowname_file_path, + mtx_rowname_col_idx = 1, + mtx_colname_file_path, + mtx_colname_col_idx = 1, + ...) { # check inputs .check_value(value) @@ -419,6 +445,7 @@ dbMatrix <- function(value, } # write ijx to db + # use duckdb::register instead of dplyr::copy_to ??? data <- dplyr::copy_to(dest = con, name = name, df = ijx, @@ -444,7 +471,7 @@ dbMatrix <- function(value, } else if(class == "dbDenseMatrix"){ set_class = "dbDenseMatrix" } else { ## redundant check from above - stopf("please specify dbMatrix class: 'dbDenseMatrix' or 'dbSparseMatrix'") + stopf("Please specify dbMatrix class: 'dbDenseMatrix' or 'dbSparseMatrix'") } res <- new(Class = set_class, @@ -566,7 +593,7 @@ toDbDense <- function(db_sparse){ data <- key |> dplyr::left_join(db_sparse[], by = c("i", "j"), suffix = c("", ".dgc")) |> - # dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> + dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> dplyr::select(-x.dgc) # data |> dplyr::compute(temporary = F) @@ -591,7 +618,6 @@ toDbDense <- function(db_sparse){ # dplyr::tbl(con, dplyr::sql(query)) |> # dplyr::compute(temporary=F) # tictoc::toc(); - # browser() # Create new dbSparseMatrix object db_dense <- new(Class = "dbDenseMatrix", @@ -603,7 +629,12 @@ toDbDense <- function(db_sparse){ } else{ # generate dbDenseMatrix from scratch - warning("Densifying dbSparseMatrix on the fly. See ?precompute to speed up densification.") + cli::cli_alert_info(paste( + 'Densifying "dbSparseMatrix" on the fly...', + "For large matrices, see ?precompute to speed up densification.", + sep = "\n", + collapse = "" + )) # to prevent 1e4 errors and allow >int32 n_rows = bit64::as.integer64(n_rows) @@ -622,7 +653,7 @@ toDbDense <- function(db_sparse){ data <- key |> dplyr::left_join(db_sparse[], by = c("i", "j"), suffix = c("", ".dgc")) |> - # dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> + dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> dplyr::select(-x.dgc) # Create new dbSparseMatrix object @@ -791,6 +822,173 @@ to_ijx_disk <- function(con, name){ return(res) } +#' dbMatrix_from_tbl +#' @description Construcst a \code{dbSparseMatrix} object from a \code{tbl_duckdb_connection} object. +#' @details +#' The \code{tbl_duckdb_connection} object must contain dimension names. +#' @param tbl \code{tbl_duckdb_connection} table in DuckDB database in long format +#' @param con DBI or duckdb connection object \code{(required)} +#' @param rownames_colName \code{character} column name of rownames in tbl \code{(required)} +#' @param colnames_colName \code{character} column name of colnames in tbl \code{(required)} +#' @param name table name to assign within database \code{(required, default: "dbMatrix")} +#' @param overwrite whether to overwrite if table already exists in database \code{(required)} +#' +#' @return dbMatrix object +#' @keywords internal +dbMatrix_from_tbl <- function(tbl, + rownames_colName, + colnames_colName, + name = "dbMatrix", + overwrite = FALSE){ + # Check args + # TODO: update to proper tbl object check + con = dbplyr::remote_con(tbl) + .check_con(con) + .check_tbl(tbl) + .check_name(name = name) + .check_overwrite( + conn = con, + name = name, + skip_value_check = TRUE, + overwrite = overwrite + ) + + if(is.null(rownames_colName) | is.null(colnames_colName)){ + stop("rownames_colName and colnames_colName must be provided") + } + + if(!all(c(rownames_colName, colnames_colName) %in% colnames(tbl))){ + stop("rownames_colName and colnames_colName must be present in tbl colnames") + } + + if(name %in% DBI::dbListTables(con) & !overwrite){ + stop("name already exists in the database. + Please choose a unique name.") + } + + # check if i and j are column names + check_names = intersect(c(colnames(tbl), as.character(rownames_colName), + as.character(colnames_colName)), + c("i", "j")) + if(length(check_names)>0) { + stop("i and j are reserved names for matrix dimensions. please choose + new column names") + } + + rownames_colName = rlang::sym(rownames_colName) + colnames_colName = rlang::sym(colnames_colName) + + # check for NA values in row/col names + n_na <- tbl |> + dplyr::filter(is.na(rownames_colName) | is.na(colnames_colName)) |> + dplyr::tally() |> + dplyr::pull(n) + + if(n_na > 0){ + stop("NA values found in rownames or colnames. Please remove NA values.") + } + + # summarize the number of counts for each gene per cell id + count_table <- tbl |> + dplyr::group_by(rownames_colName, colnames_colName) |> + dplyr::summarise(x = dplyr::n(), .groups = "drop") + + # add label encodings and get dimensions, dim names + i_encoded = rlang::sym(paste0(as.character(rownames_colName), "_encoded")) + j_encoded = rlang::sym(paste0(as.character(colnames_colName), "_encoded")) + + count_table <- count_table |> + dplyr::mutate(i_encoded := dplyr::dense_rank(rownames_colName)) |> + dbplyr::window_order(rownames_colName) + + row_names <- count_table |> + dplyr::distinct(rownames_colName) |> + dplyr::arrange(rownames_colName) |> + dplyr::pull(rownames_colName) + + dim_i <- as.integer(length(row_names)) + + count_table <- count_table |> + dplyr::mutate(j_encoded := dplyr::dense_rank(colnames_colName)) |> + dbplyr::window_order(colnames_colName) |> + dplyr::ungroup() + + col_names <- count_table |> + dplyr::distinct(colnames_colName) |> + dplyr::arrange(colnames_colName) |> + dplyr::pull(colnames_colName) + + dim_j <- as.integer(length(col_names)) + + ijx <- count_table |> + dplyr::select(i = i_encoded, j = j_encoded, x) |> + dplyr::compute(name = name, overwrite = overwrite) # save to db + + # set metadata + dims = c(dim_i, dim_j) + dim_names = list(row_names, col_names) + + res <- new(Class = "dbSparseMatrix", + value = ijx, + name = name, + init = TRUE, + dim_names = dim_names, + dims = dims) + + return(res) + + # # Create 'index' with distinct 'genes' and 'id' + # index <- count_table |> + # dplyr::distinct(rownames_colName, colnames_colName) + # + # # map unique integer to each gene and cellID for dbMatrix creation + # # Note: window_order is necessary to have reproducible j assignment + # tbl_i <- index |> + # dplyr::distinct(rownames_colName) |> + # dbplyr::window_order(rownames_colName) |> + # dplyr::mutate(i = dplyr::row_number()) + # + # tbl_j <- index |> + # dplyr::distinct(colnames_colName) |> + # dbplyr::window_order(colnames_colName) |> + # dplyr::mutate(j = dplyr::row_number()) + # + # ijx <- count_table |> + # dplyr::inner_join(tbl_i, by = as.character(rownames_colName)) |> + # dplyr::inner_join(tbl_j, by = as.character(colnames_colName)) |> + # dplyr::select(i, j, x) |> + # dplyr::compute(name = name, overwrite = overwrite) + + #dplyr::compute(temporary = FALSE, name = name, overwrite = TRUE) + + # Note: dim must be less than 2^31 int32 limit + # dim_i = tbl_i |> dplyr::tally() |> dplyr::pull(n) |> as.integer() + # dim_j = tbl_j |> dplyr::tally() |> dplyr::pull(n) |> as.integer() + # dims = c(dim_i, dim_j) + # + # # Note: factor in dbMatrix constructor + # row_names = tbl_i |> dplyr::pull(rownames_colName) + # col_names = tbl_j |> dplyr::pull(colnames_colName) + # dim_names = list(row_names, col_names) + + # Pass ijx to the dbMatrix constructor + # res <- dbMatrix::dbMatrix(value = ijx, + # class = "dbSparseMatrix", + # con = con, + # name = name, + # dims = dims, + # dim_names = dim_names, + # overwrite = overwrite) + # TODO: provide option for dbDenseMatrix + + # res <- new(Class = "dbSparseMatrix", + # value = ijx, + # name = name, + # init = TRUE, + # dim_names = dim_names, + # dims = dims) +} + # readers #### #' read_matrix #' @description Read tabular matrix files into database @@ -1131,6 +1329,7 @@ make_ijx_dimnames <- function(dbMatrix, overwrite = FALSE, colName_i, colName_j) { + # input validation .check_name(name) .check_name(colName_i) .check_name(colName_j) @@ -1144,44 +1343,33 @@ make_ijx_dimnames <- function(dbMatrix, ) dimnames <- dimnames(dbMatrix) - colName_i = rlang::sym(colName_i) - colName_j = rlang::sym(colName_j) + # map dimnames to indices in-memory + dt_rownames <- data.table::data.table(dimnames[[1]]) + data.table::setnames(dt_rownames, colName_i) + dt_rownames[, i := .I] - # add dimnames to database - dimnames1_tbl <- dplyr::copy_to( - con, - data.frame(i = seq_along(dimnames[[1]]), colName_i = dimnames[[1]]), - overwrite = TRUE, - name = "tmp_dimnames1", - temporary = TRUE - ) + dt_colnames <- data.table::data.table(dimnames[[2]]) + data.table::setnames(dt_colnames, colName_j) + dt_colnames[, j := .I] - dimnames2_tbl <- dplyr::copy_to( - con, - data.frame(j = seq_along(dimnames[[2]]), colName_j = dimnames[[2]]), - overwrite = TRUE, - name = "tmp_dimnames2", - temporary = TRUE - ) + # register map to db + duckdb::duckdb_register(con, "temp_rownames", dt_rownames, overwrite = TRUE) + duckdb::duckdb_register(con, "temp_colnames", dt_colnames, overwrite = TRUE) + + dimnames1_tbl <- dplyr::tbl(con, "temp_rownames") + dimnames2_tbl <- dplyr::tbl(con, "temp_colnames") - res <- dbMatrix@value |> + res <- dbMatrix[] |> dplyr::left_join(dimnames1_tbl, by = "i") |> dplyr::left_join(dimnames2_tbl, by = "j") |> - dplyr::select( - i, - !!colName_i := colName_i, # !! to unquote - j, - !!colName_j := colName_j, # !! to unquote - x - ) |> - dplyr::compute( - name = name, - temporary = TRUE, - overwrite = overwrite - ) - - DBI::dbRemoveTable(con, "tmp_dimnames1") - DBI::dbRemoveTable(con, "tmp_dimnames2") + dplyr::select(i, + !!colName_i := colName_i, # !! to unquote + j, + !!colName_j := colName_j, # !! to unquote + x) + + # DBI::dbExecute(con, glue::glue("DROP VIEW IF EXISTS temp_rownames")) + # DBI::dbExecute(con, glue::glue("DROP VIEW IF EXISTS temp_colnames")) return(res) From ba346db05dde5551849416387ee58e310835936d Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Tue, 23 Jul 2024 08:59:48 -0400 Subject: [PATCH 04/27] fix: update `extract.R` and `names.R` --- Remove dbData from methods for dbMatrix as temporary placeholder before migrating to dbData. TODO: move files to dbData and remove from dbMatrix --- R/extract.R | 12 ++++++------ R/names.R | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/extract.R b/R/extract.R index 5cd9058..acfbfb9 100644 --- a/R/extract.R +++ b/R/extract.R @@ -3,7 +3,7 @@ ### Extract [] #### #' @rdname hidden_aliases #' @export -setMethod('[', signature(x = 'dbData', i = 'missing', j = 'missing', drop = 'missing'), +setMethod('[', signature(x = 'dbMatrix', i = 'missing', j = 'missing', drop = 'missing'), function(x, i, j) { x@value }) @@ -12,7 +12,7 @@ setMethod('[', signature(x = 'dbData', i = 'missing', j = 'missing', drop = 'mis # no initialize to prevent slowdown #' @rdname hidden_aliases #' @export -setMethod('[<-', signature(x = 'dbData', i = 'missing', j = 'missing', value = 'ANY'), +setMethod('[<-', signature(x = 'dbMatrix', i = 'missing', j = 'missing', value = 'ANY'), function(x, i, j, value) { x@value = value x @@ -45,7 +45,7 @@ setMethod('[', signature(x = 'dbMatrix', i = 'dbIndex', j = 'missing', drop = 'm temporary = TRUE) # subset dbMatrix - x@value <- x@value |> + x[] <- x[] |> dplyr::filter(i %in% !!map$i) |> dplyr::inner_join(map_temp, by = c("i" = "i")) |> dplyr::select(i = new_i, j, x) @@ -84,7 +84,7 @@ setMethod('[', signature(x = 'dbMatrix', i = 'missing', j = 'dbIndex', drop = 'm map_temp <- dplyr::tbl(con, "map_temp_j") # subset dbMatrix - x@value <- x@value |> + x[] <- x[] |> dplyr::filter(j %in% !!map$j) |> dplyr::inner_join(map_temp, by = c("j" = "j")) |> dplyr::select(i, j = new_j, x) @@ -134,7 +134,7 @@ setMethod('[', signature(x = 'dbMatrix', i = 'dbIndex', j = 'dbIndex', drop = 'm temporary = TRUE) map_temp <- dplyr::tbl(con, "map_temp_ij_i") - x@value <- x@value |> + x[] <- x[] |> dplyr::filter(j %in% !!map_j$j) |> dplyr::inner_join(map_temp, by = c("j" = "j")) |> dplyr::select(i, j = new_j, x) @@ -149,7 +149,7 @@ setMethod('[', signature(x = 'dbMatrix', i = 'dbIndex', j = 'dbIndex', drop = 'm map_temp <- dplyr::tbl(con, "map_temp_ij_j") - x@value <- x@value |> + x[] <- x[] |> dplyr::filter(i %in% !!map_i$i) |> dplyr::inner_join(map_temp, by = c("i" = "i")) |> dplyr::select(i=new_i, j , x) diff --git a/R/names.R b/R/names.R index b27fad3..44a910a 100644 --- a/R/names.R +++ b/R/names.R @@ -18,7 +18,7 @@ setMethod('names<-', signature(x = 'dbDataFrame', value = 'dbIndex'), function(x # rownames #### #' @rdname hidden_aliases #' @export -setMethod('rownames', signature(x = 'dbData'), function(x) { +setMethod('rownames', signature(x = 'dbMatrix'), function(x) { rownames(x@value) }) @@ -41,7 +41,7 @@ setMethod('rownames<-', signature(x = 'dbMatrix'), function(x, value) { # colnames #### #' @rdname hidden_aliases #' @export -setMethod('colnames', signature(x = 'dbData'), function(x) { +setMethod('colnames', signature(x = 'dbMatrix'), function(x) { colnames(x@value) }) From 5930cddab61e76588cade6b4db519f0e6acecd38 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Tue, 23 Jul 2024 09:00:24 -0400 Subject: [PATCH 05/27] refactor: `operations.R` and `simulate.R` --- R/operations.R | 53 +++++++++++++++----------------------------------- R/simulate.R | 3 ++- 2 files changed, 18 insertions(+), 38 deletions(-) diff --git a/R/operations.R b/R/operations.R index 932038d..5f30f53 100644 --- a/R/operations.R +++ b/R/operations.R @@ -58,20 +58,6 @@ arith_call_dbm_vect_multi = function(dbm, num_vect, generic_char, ordered_args) dbm[] = eval(str2lang(build_call)) - # } else { # if on disk use faster duckdb cli - # # close pool connection - # DBI::dbDisconnect(conn, shutdown = TRUE) - # - # # construct query - # query <- paste0("UPDATE ", remote_name ," SET x = x ", generic_char, " ", num_vect, ";") - # - # # send query to duckdb cli for faster processing - # system(paste("duckdb", db_path, shQuote(query))) - # } - - # return to pool connector - # cPool(dbm) = p - # show return(dbm) } @@ -245,8 +231,7 @@ setMethod('rowSums', signature(x = 'dbDenseMatrix'), #' @rdname hidden_aliases #' @export setMethod('rowSums', signature(x = 'dbSparseMatrix'), - function(x, ...) - { + function(x, ...){ x = castNumeric(x) # calc rowsum for nonzero values in ijx @@ -314,8 +299,7 @@ setMethod('colSums', signature(x = 'dbDenseMatrix'), #' @rdname hidden_aliases #' @export setMethod('colSums', signature(x = 'dbSparseMatrix'), - function(x, ...) - { + function(x, ...){ x = castNumeric(x) # calc colsum for nonzero values in ijx @@ -360,8 +344,7 @@ setMethod('colSums', signature(x = 'dbSparseMatrix'), #' @rdname hidden_aliases #' @export setMethod('rowMeans', signature(x = 'dbDenseMatrix'), - function(x, ...) - { + function(x, ...){ x = castNumeric(x) val_names = rownames(x) @@ -385,8 +368,7 @@ setMethod('rowMeans', signature(x = 'dbDenseMatrix'), #' @rdname hidden_aliases #' @export setMethod('rowMeans', signature(x = 'dbSparseMatrix'), - function(x, ...) - { + function(x, ...){ x = castNumeric(x) # get non-zero row idx (factors) and convert to integers @@ -414,18 +396,20 @@ setMethod('rowMeans', signature(x = 'dbSparseMatrix'), #' @rdname hidden_aliases #' @export setMethod('colMeans', signature(x = 'dbDenseMatrix'), - function(x, ...) - { + function(x, ...){ x = castNumeric(x) val_names = colnames(x) + vals = x[] |> dplyr::group_by(j) |> dplyr::summarise(mean_x = mean(x, na.rm = TRUE)) |> dplyr::arrange(j) |> dplyr::collapse() |> dplyr::pull(mean_x) + names(vals) = val_names + vals }) @@ -433,8 +417,7 @@ setMethod('colMeans', signature(x = 'dbDenseMatrix'), #' @rdname hidden_aliases #' @export setMethod('colMeans', signature(x = 'dbSparseMatrix'), - function(x, ...) - { + function(x, ...){ x = castNumeric(x) # get non-zero column idx (factors) and convert to integers @@ -463,8 +446,7 @@ setMethod('colMeans', signature(x = 'dbSparseMatrix'), #' @rdname hidden_aliases #' @export setMethod('colSds', signature(x = 'dbDenseMatrix'), - function(x, ...) - { + function(x, ...){ # x = reconnect(x) x = castNumeric(x) @@ -483,8 +465,7 @@ setMethod('colSds', signature(x = 'dbDenseMatrix'), #' @rdname hidden_aliases #' @export setMethod('colSds', signature(x = 'dbSparseMatrix'), - function(x, ...) - { + function(x, ...){ # x = reconnect(x) x = castNumeric(x) @@ -511,8 +492,7 @@ setMethod('colSds', signature(x = 'dbSparseMatrix'), #' @rdname hidden_aliases #' @export setMethod('rowSds', signature(x = 'dbDenseMatrix'), - function(x, ...) - { + function(x, ...){ # x = reconnect(x) x = castNumeric(x) @@ -531,8 +511,7 @@ setMethod('rowSds', signature(x = 'dbDenseMatrix'), #' @rdname hidden_aliases #' @export setMethod('rowSds', signature(x = 'dbSparseMatrix'), - function(x, ...) - { + function(x, ...){ # x = reconnect(x) x = castNumeric(x) @@ -626,7 +605,7 @@ setMethod('nrow', signature(x = 'dbMatrix'), function(x) { # x = reconnect(x) if (is.na(x@dims[1L])) { - conn = pool::localCheckout(cPool(x)) + conn = get_con(x) res = DBI::dbGetQuery(conn = conn, sprintf('SELECT DISTINCT i from %s', remoteName(x))) } else { @@ -653,7 +632,7 @@ setMethod('ncol', signature(x = 'dbMatrix'), function(x) { # x = reconnect(x) if (is.na(x@dims[2L])) { - conn = pool::localCheckout(cPool(x)) + conn = get_con(x) res = DBI::dbGetQuery(conn = conn, sprintf('SELECT DISTINCT j from %s', remoteName(x))) } else { @@ -742,7 +721,7 @@ setMethod('tail', signature(x = 'dbDataFrame'), function(x, n = 6L, ...) { #' @param ... additional params to pass #' @export setMethod('colTypes', signature(x = 'dbData'), function(x, ...) { - vapply(data.table::as.data.table(head(x[], 1L)), typeof, character(1L)) + vapply(data.table::as.data.table(head(slot(x, "value"), 1L)), typeof, character(1L)) }) ## castNumeric #### diff --git a/R/simulate.R b/R/simulate.R index 02bd592..42fea61 100644 --- a/R/simulate.R +++ b/R/simulate.R @@ -4,7 +4,8 @@ sim_duckdb = function(value = datasets::iris, name = 'test', con = NULL, memory = TRUE) { - # setup in-memory db if no pool connection provided + + # setup in-memory db if(is.null(con)) { if(memory){ drv = duckdb::duckdb(dbdir = ':memory:') From 7c73aad58a161013462e716ee15900c8152bd224 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Tue, 23 Jul 2024 09:00:33 -0400 Subject: [PATCH 06/27] refactor: `as_matrix` --- R/utils.R | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/R/utils.R b/R/utils.R index e4e5a17..adcd8b2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -142,17 +142,24 @@ as_matrix <- function(x){ stop("Invalid input. Only dbMatrix is currently supported.") } - check_class = class(x) - - # Get dbMatrix in triplet vector format (TSparseMatrix) - df = x@value |> as.data.frame() + check_class <- class(x) + dims <- dim(x) + dim_names <- dimnames(x) + + # convert db table into in-memory dt + if (dims[1] > 1e5 || dims[2] > 1e5){ + cli::cli_alert_warning( + "Warning: Converting large dbMatrix to in-memory Matrix.") + } - dims = dim(x) - dim_names = dimnames(x) + dt <- data.table::CJ(i = 1:dims[1], j = 1:dims[2], x= 0) + dt2 <- data.table::as.data.table(x@value) + dt[dt2, on = .(i, j), x := i.x] # Create mat # Note: casting to sparseMatrix automatically converts to 0-based indexing - mat = Matrix::sparseMatrix(i = df$i , j = df$j , x = df$x) + mat <- Matrix::sparseMatrix(i = dt$i , j = dt$j , x = dt$x) + mat <- Matrix::drop0(mat) dimnames(mat) = dim_names dim(mat) = dims From 7905eef9191e5b3be527ae5c0953654f28289213 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Tue, 23 Jul 2024 09:00:42 -0400 Subject: [PATCH 07/27] docs: update --- man/dbMatrix.Rd | 2 +- man/dbMatrix_from_tbl.Rd | 37 +++++++++++++++++++++++++++++++++++++ man/hidden_aliases.Rd | 16 +++++++--------- 3 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 man/dbMatrix_from_tbl.Rd diff --git a/man/dbMatrix.Rd b/man/dbMatrix.Rd index 07489bb..2b03de7 100644 --- a/man/dbMatrix.Rd +++ b/man/dbMatrix.Rd @@ -43,7 +43,7 @@ dbMatrix( \item{overwrite}{whether to overwrite if table already exists in database \code{(required)}} -\item{name}{table name to assign within database \code{(optional, default: "dbMatrix")}} +\item{name}{table name to assign within database \code{(required, default: "dbMatrix")}} \item{dims}{dimensions of the matrix \code{(optional: [int, int])}} diff --git a/man/dbMatrix_from_tbl.Rd b/man/dbMatrix_from_tbl.Rd new file mode 100644 index 0000000..fb004fc --- /dev/null +++ b/man/dbMatrix_from_tbl.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dbMatrix.R +\name{dbMatrix_from_tbl} +\alias{dbMatrix_from_tbl} +\title{dbMatrix_from_tbl} +\usage{ +dbMatrix_from_tbl( + tbl, + rownames_colName, + colnames_colName, + name = "dbMatrix", + overwrite = FALSE +) +} +\arguments{ +\item{tbl}{\code{tbl_duckdb_connection} table in DuckDB database in long format} + +\item{rownames_colName}{\code{character} column name of rownames in tbl \code{(required)}} + +\item{colnames_colName}{\code{character} column name of colnames in tbl \code{(required)}} + +\item{name}{table name to assign within database \code{(required, default: "dbMatrix")}} + +\item{overwrite}{whether to overwrite if table already exists in database \code{(required)}} + +\item{con}{DBI or duckdb connection object \code{(required)}} +} +\value{ +dbMatrix object +} +\description{ +Construcst a \code{dbSparseMatrix} object from a \code{tbl_duckdb_connection} object. +} +\details{ +The \code{tbl_duckdb_connection} object must contain dimension names. +} +\keyword{internal} diff --git a/man/hidden_aliases.Rd b/man/hidden_aliases.Rd index 458df4b..a9bbf5e 100644 --- a/man/hidden_aliases.Rd +++ b/man/hidden_aliases.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/extract.R, R/names.R, R/operations.R -\name{[,dbData,missing,missing,missing-method} -\alias{[,dbData,missing,missing,missing-method} -\alias{[<-,dbData,missing,missing,ANY-method} +\name{[,dbMatrix,missing,missing,missing-method} +\alias{[,dbMatrix,missing,missing,missing-method} +\alias{[<-,dbMatrix,missing,missing,ANY-method} \alias{[,dbMatrix,dbIndex,missing,missing-method} \alias{[,dbMatrix,missing,dbIndex,missing-method} \alias{[,dbMatrix,dbIndex,dbIndex,missing-method} @@ -13,10 +13,8 @@ \alias{[<-,dbDataFrame,missing,missing,ANY-method} \alias{names,dbDataFrame-method} \alias{names<-,dbDataFrame,dbIndex-method} -\alias{rownames,dbData-method} \alias{rownames,dbMatrix-method} \alias{rownames<-,dbMatrix-method} -\alias{colnames,dbData-method} \alias{colnames,dbMatrix-method} \alias{colnames<-,dbMatrix,ANY-method} \alias{colnames<-,dbDataFrame,dbIndex-method} @@ -52,9 +50,9 @@ \alias{dim,dbMatrix-method} \title{rowSums} \usage{ -\S4method{[}{dbData,missing,missing,missing}(x, i, j) +\S4method{[}{dbMatrix,missing,missing,missing}(x, i, j) -\S4method{[}{dbData,missing,missing,ANY}(x, i, j) <- value +\S4method{[}{dbMatrix,missing,missing,ANY}(x, i, j) <- value \S4method{[}{dbMatrix,dbIndex,missing,missing}(x, i, j, ..., drop = TRUE) @@ -76,13 +74,13 @@ \S4method{names}{dbDataFrame,dbIndex}(x) <- value -\S4method{rownames}{dbData}(x) +\S4method{rownames}{dbMatrix}(x) \S4method{rownames}{dbMatrix}(x) \S4method{rownames}{dbMatrix}(x) <- value -\S4method{colnames}{dbData}(x) +\S4method{colnames}{dbMatrix}(x) \S4method{colnames}{dbMatrix}(x) From 06a37ee746aab49570cd27164078fd6d0d409f4d Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 31 Jul 2024 00:16:53 -0400 Subject: [PATCH 08/27] refactor: matrix summary functions --- avoids pulling into memory duplicate values and instead finds distinct values in db --- R/operations.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/R/operations.R b/R/operations.R index 5f30f53..ad31e16 100644 --- a/R/operations.R +++ b/R/operations.R @@ -243,9 +243,9 @@ setMethod('rowSums', signature(x = 'dbSparseMatrix'), # get row_idx for non-zero values in ijx nonzero_row_indices = x[] |> + dplyr::distinct(i) |> dplyr::arrange(i) |> - dplyr::pull(i) |> - unique() + dplyr::pull(i) # format data for join operation nonzero_rownames = rownames(x)[nonzero_row_indices] @@ -311,9 +311,9 @@ setMethod('colSums', signature(x = 'dbSparseMatrix'), # get col_idx for non-zero values in ijx nonzero_col_indices = x[] |> + dplyr::distinct(j) |> dplyr::arrange(j) |> - dplyr::pull(j) |> - unique() + dplyr::pull(j) # format data for join operation nonzero_colnames = colnames(x)[nonzero_col_indices] @@ -373,9 +373,9 @@ setMethod('rowMeans', signature(x = 'dbSparseMatrix'), # get non-zero row idx (factors) and convert to integers row_indices = x[] |> + dplyr::distinct(i) |> dplyr::arrange(i) |> dplyr::pull(i) |> - unique() |> as.integer() # get non-zero row names by row idx @@ -422,11 +422,10 @@ setMethod('colMeans', signature(x = 'dbSparseMatrix'), # get non-zero column idx (factors) and convert to integers col_indices = x[] |> + dplyr::distinct(j) |> dplyr::arrange(j) |> dplyr::pull(j) |> - unique() |> - as.integer() |> - sort() + as.integer() # get non-zero column names by column idx val_names = factor(colnames(x)[col_indices]) From be9d6030dbbe0149b73a2f20af3642635c44acb2 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 31 Jul 2024 00:17:02 -0400 Subject: [PATCH 09/27] chore: update imports --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e66a524..5a3e306 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,9 @@ Imports: glue, bit64, crayon, - cli + cli, + R.utils, + checkmate Suggests: knitr, rmarkdown, From f09e179855ce4bdb7d8adc25bc3fdfa2b9148e6d Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Sun, 4 Aug 2024 10:17:46 -0400 Subject: [PATCH 10/27] refactor: `as_matrix()` --- - update params in map_ijx_dimnames --- R/dbMatrix.R | 107 ++++++++++++++++++++++++++++++++++++++++++--------- R/utils.R | 66 +------------------------------ 2 files changed, 90 insertions(+), 83 deletions(-) diff --git a/R/dbMatrix.R b/R/dbMatrix.R index 4eadb1b..a5346af 100644 --- a/R/dbMatrix.R +++ b/R/dbMatrix.R @@ -366,7 +366,6 @@ dbMatrix <- function(value, mtx_colname_file_path, mtx_colname_col_idx = 1, ...) { - # check inputs .check_value(value) .check_con(con) @@ -822,6 +821,89 @@ to_ijx_disk <- function(con, name){ return(res) } +#' as_matrix +#' +#' @param x dbSparseMatrix +#' @details +#' this is a helper function to convert dbMatrix to dgCMatrix or matrix +#' Warning: this fn can lead memory issue if the dbMatrix is large +#' +#' +#' @return dgCMatrix or matrix +#' @noRd +as_matrix <- function(x){ + # check that x is a dbSparseMatrix + if(!inherits(x = x, what = "dbSparseMatrix")){ + stop("Invalid input. Only dbSparseMatrix is currently supported.") + } + con <- dbplyr::remote_con(x[]) + .check_con(con) + dims <- dim(x) + n_rows <- dims[1] + n_cols <- dims[2] + dim_names <- dimnames(x) + + # convert db table into in-memory dt + if (dims[1] > 1e5 || dims[2] > 1e5){ + cli::cli_alert_warning( + "Warning: Converting large dbMatrix to in-memory Matrix.") + } + + max_i <- x[] |> dplyr::summarise(max_i = max(i)) |> dplyr::pull(max_i) + max_j <- x[] |> dplyr::summarise(max_j = max(j)) |> dplyr::pull(max_j) + + temp_file <- tempfile(tmpdir = getwd(), fileext = ".parquet") + + if (max_i == n_rows & max_j == n_cols){ + x[] |> arrow::to_arrow() |> arrow::write_parquet(temp_file) + } else { + # Generate i and j vectors from scratch + sql_i <- glue::glue("SELECT i FROM generate_series(1, {n_rows}) AS t(i)") + sequence_i <- dplyr::tbl(con, dplyr::sql(sql_i)) + + sql_j <- glue::glue("SELECT j FROM generate_series(1, {n_cols}) AS t(j)") + sequence_j <- dplyr::tbl(con, dplyr::sql(sql_j)) + + key <- sequence_i |> + dplyr::cross_join(sequence_j) |> + dplyr::mutate(x = 0) + + key |> + dplyr::left_join(x[], by = c("i", "j"), suffix = c("", ".dgc")) |> + dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> + dplyr::select(-x.dgc) |> + arrow::to_arrow() |> + arrow::write_parquet(temp_file) + } + + # Create mat + dt <- arrow::read_parquet(temp_file) + mat <- Matrix::sparseMatrix(i = dt$i , j = dt$j , x = dt$x, index1 = TRUE) + mat <- Matrix::drop0(mat) + dimnames(mat) = dim_names + dim(mat) = dims + unlink(temp_file) + + return(mat) +} + +#' @title as_ijx +#' @param x dgCMatrix or matrix +#' @noRd +as_ijx <- function(x){ + # check that x is a dgCMatrix or matrix + stopifnot(is(x, "dgCMatrix") || is(x, "matrix")) + + # Convert dgc into TsparseMatrix class from {Matrix} + ijx <- as(x, "TsparseMatrix") + + # Get dbMatrix in triplet vector format (TSparseMatrix) + # Convert to 1-based indexing + df = data.frame(i = ijx@i + 1, j = ijx@j + 1, x = ijx@x) + + return(df) +} + #' dbMatrix_from_tbl #' @description Construcst a \code{dbSparseMatrix} object from a \code{tbl_duckdb_connection} object. #' @details @@ -1308,7 +1390,7 @@ save <- function(dbMatrix, name = '', overwrite = FALSE, ...){ } # dimnames #### -#' Create database table with ijx and dimnames +#' Map dimnames to i,j indices #' @details #' Constructs a table in a database that contains the accompanying dimnames #' for a dbMatrix. The resulting columns in the table: @@ -1318,29 +1400,19 @@ save <- function(dbMatrix, name = '', overwrite = FALSE, ...){ #' * j_names (colnames) #' * x (counts of i,j occcurences) #' @param dbMatrix dbMatrix object -#' @param name name of table to add to database #' @param colName_i name of column rownames to add to database #' @param colName_j name of column colnames to add to database -#' @param overwrite whether to overwrite if table already exists in database #' default: 'FALSE'.' #' @keywords internal -make_ijx_dimnames <- function(dbMatrix, - name, - overwrite = FALSE, - colName_i, - colName_j) { +map_ijx_dimnames <- function(dbMatrix, + colName_i, + colName_j) { # input validation - .check_name(name) .check_name(colName_i) .check_name(colName_j) con <- get_con(dbMatrix) .check_con(con) - .check_overwrite( - conn = con, - overwrite = overwrite, - name = name, - skip_value_check = TRUE - ) + dimnames <- dimnames(dbMatrix) # map dimnames to indices in-memory @@ -1368,9 +1440,6 @@ make_ijx_dimnames <- function(dbMatrix, !!colName_j := colName_j, # !! to unquote x) - # DBI::dbExecute(con, glue::glue("DROP VIEW IF EXISTS temp_rownames")) - # DBI::dbExecute(con, glue::glue("DROP VIEW IF EXISTS temp_colnames")) - return(res) } diff --git a/R/utils.R b/R/utils.R index adcd8b2..64decad 100644 --- a/R/utils.R +++ b/R/utils.R @@ -30,9 +30,9 @@ wrap_txt = function(..., sep = ' ', strWidth = 100, errWidth = FALSE) { cat(..., sep = sep) |> capture.output() |> - strwrap(., prefix = ' ', initial = '', # indent later lines, no indent first line + strwrap(prefix = ' ', initial = '', # indent later lines, no indent first line width = min(80, getOption("width"), strWidth)) |> - paste(., collapse = '\n') + paste(collapse = '\n') } @@ -124,65 +124,3 @@ setMethod('dbListTables', signature(x = 'dbMatrix'), con <- get_con(x) DBI::dbListTables(conn = con) }) - -# Converters #### -#' as_matrix -#' -#' @param x dbSparseMatrix -#' @details -#' this is a helper function to convert dbMatrix to dgCMatrix or matrix -#' Warning: this fn can lead memory issue if the dbMatrix is large -#' -#' -#' @return dgCMatrix or matrix -#' @noRd -as_matrix <- function(x){ - # check that x is a dbSparseMatrix - if(!inherits(x = x, what = "dbMatrix")){ - stop("Invalid input. Only dbMatrix is currently supported.") - } - - check_class <- class(x) - dims <- dim(x) - dim_names <- dimnames(x) - - # convert db table into in-memory dt - if (dims[1] > 1e5 || dims[2] > 1e5){ - cli::cli_alert_warning( - "Warning: Converting large dbMatrix to in-memory Matrix.") - } - - dt <- data.table::CJ(i = 1:dims[1], j = 1:dims[2], x= 0) - dt2 <- data.table::as.data.table(x@value) - dt[dt2, on = .(i, j), x := i.x] - - # Create mat - # Note: casting to sparseMatrix automatically converts to 0-based indexing - mat <- Matrix::sparseMatrix(i = dt$i , j = dt$j , x = dt$x) - mat <- Matrix::drop0(mat) - dimnames(mat) = dim_names - dim(mat) = dims - - if(check_class == "dbSparseMatrix"){ - return(mat) - } else { - return(as.matrix(mat)) - } -} - -#' @title as_ijx -#' @param x dgCMatrix or matrix -#' @noRd -as_ijx <- function(x){ - # check that x is a dgCMatrix or matrix - stopifnot(is(x, "dgCMatrix") || is(x, "matrix")) - - # Convert dgc into TsparseMatrix class from {Matrix} - ijx <- as(x, "TsparseMatrix") - - # Get dbMatrix in triplet vector format (TSparseMatrix) - # Convert to 1-based indexing - df = data.frame(i = ijx@i + 1, j = ijx@j + 1, x = ijx@x) - - return(df) -} From 83b6fb6d5336913c97331887c462a0a6c71a050c Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Sun, 4 Aug 2024 10:18:39 -0400 Subject: [PATCH 11/27] refactor: `map_ijx_dimnames` --- ...ake_ijx_dimnames.Rd => map_ijx_dimnames.Rd} | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) rename man/{make_ijx_dimnames.Rd => map_ijx_dimnames.Rd} (55%) diff --git a/man/make_ijx_dimnames.Rd b/man/map_ijx_dimnames.Rd similarity index 55% rename from man/make_ijx_dimnames.Rd rename to man/map_ijx_dimnames.Rd index 54c5275..7a68e8d 100644 --- a/man/make_ijx_dimnames.Rd +++ b/man/map_ijx_dimnames.Rd @@ -1,25 +1,21 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dbMatrix.R -\name{make_ijx_dimnames} -\alias{make_ijx_dimnames} -\title{Create database table with ijx and dimnames} +\name{map_ijx_dimnames} +\alias{map_ijx_dimnames} +\title{Map dimnames to i,j indices} \usage{ -make_ijx_dimnames(dbMatrix, name, overwrite = FALSE, colName_i, colName_j) +map_ijx_dimnames(dbMatrix, colName_i, colName_j) } \arguments{ \item{dbMatrix}{dbMatrix object} -\item{name}{name of table to add to database} - -\item{overwrite}{whether to overwrite if table already exists in database -default: 'FALSE'.'} - \item{colName_i}{name of column rownames to add to database} -\item{colName_j}{name of column colnames to add to database} +\item{colName_j}{name of column colnames to add to database +default: 'FALSE'.'} } \description{ -Create database table with ijx and dimnames +Map dimnames to i,j indices } \details{ Constructs a table in a database that contains the accompanying dimnames From 22861d2a2033cc056fffd9673936112e2e28bd90 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Sun, 4 Aug 2024 13:40:25 -0400 Subject: [PATCH 12/27] chore: remove old classes --- NAMESPACE | 5 -- R/classes.R | 47 ------------- R/extract.R | 90 +------------------------ R/names.R | 37 +--------- R/operations.R | 38 ----------- R/simulate.R | 14 ---- man/dbDataFrame-class.Rd | 23 ------- man/flex_window_order.Rd | 17 ----- man/head-dbDataFrame-method.Rd | 11 --- man/hidden_aliases.Rd | 44 ------------ man/simulate_objects.Rd | 5 -- man/tail-dbDataFrame-method.Rd | 11 --- stache_tests/testthat/test-connection.R | 43 ------------ stache_tests/testthat/test-extract.R | 12 ---- stache_tests/testthat/test-query.R | 27 -------- stache_tests/testthat/test-spatchunk.R | 23 ------- 16 files changed, 2 insertions(+), 445 deletions(-) delete mode 100644 man/dbDataFrame-class.Rd delete mode 100644 man/flex_window_order.Rd delete mode 100644 man/head-dbDataFrame-method.Rd delete mode 100644 man/tail-dbDataFrame-method.Rd delete mode 100644 stache_tests/testthat/test-connection.R delete mode 100644 stache_tests/testthat/test-extract.R delete mode 100644 stache_tests/testthat/test-query.R delete mode 100644 stache_tests/testthat/test-spatchunk.R diff --git a/NAMESPACE b/NAMESPACE index 659638a..51e68bc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,23 +1,19 @@ # Generated by roxygen2: do not edit by hand -export(dbDataFrame) export(dbDenseMatrix) export(dbMatrix) export(dbSparseMatrix) export(precompute) export(readMM) export(read_matrix) -export(sim_dbDataFrame) export(sim_dbDenseMatrix) export(sim_dbSparseMatrix) -exportClasses(dbDataFrame) exportClasses(dbDenseMatrix) exportClasses(dbSparseMatrix) exportMethods("[") exportMethods("[<-") exportMethods("colnames<-") exportMethods("dimnames<-") -exportMethods("names<-") exportMethods("rownames<-") exportMethods(Arith) exportMethods(Ops) @@ -34,7 +30,6 @@ exportMethods(dimnames) exportMethods(head) exportMethods(log) exportMethods(mean) -exportMethods(names) exportMethods(ncol) exportMethods(nrow) exportMethods(rowMeans) diff --git a/R/classes.R b/R/classes.R index 0c26f87..219fe47 100644 --- a/R/classes.R +++ b/R/classes.R @@ -69,29 +69,6 @@ dbSparseMatrix = setClass( contains = "dbMatrix" ) -### dbDataFrame #### - - -#' @title S4 dbDataFrame class -#' @description -#' Representation of dataframes using an on-disk database. Each object -#' is used as a connection to a single table that exists within the database. -#' @slot data dplyr tbl that represents the database data -#' @slot hash unique hash ID for backend -#' @slot remote_name name of table within database that contains the data -#' @slot key column to set as key for ordering and subsetting on i -#' @export -dbDataFrame = setClass( - Class = 'dbDataFrame', - contains = 'dbData', - slots = list( - key = 'character' - ), - prototype = list( - key = NA_character_ - ) -) - ## dbIndex #### #' @title Virtual Class "dbIndex" - Simple Class for dbData indices #' @name dbIndex @@ -104,27 +81,3 @@ dbDataFrame = setClass( #' @noRd setClassUnion(name = 'dbIndex', members = c('logical', 'numeric', 'integer', 'character')) - -#' @title Virtual Class "dbIndexNonChar" - Simple Class for dbData indices -#' @name dbIndex -#' @description -#' This is a virtual class used for indices (in signatures) for indexing -#' and sub-assignment of 'dbData' objects. Simple class union of 'logical' and -#' 'numeric'. -#' Based on the 'index' class implemented in \pkg{Matrix} -#' @keywords internal -#' @noRd -setClassUnion(name = 'dbIndexNonChar', - members = c('logical', 'numeric')) - -## dbMF #### -#' @title Virtual Class "dbMFData" - Simple class for dbMatrix and dbDF -#' @name dbMFData -#' @description -#' This is a virtual class used to refer to dbMatrix and dbDataFrame objects as -#' a single signature. -#' @keywords internal -#' @noRd -setClassUnion(name = 'dbMFData', - members = c('dbMatrix', 'dbDataFrame')) - diff --git a/R/extract.R b/R/extract.R index acfbfb9..7ab68a2 100644 --- a/R/extract.R +++ b/R/extract.R @@ -175,92 +175,4 @@ get_dbM_sub_j = function(index, dbM_dimnames) { if(is.character(index)) return(index) j_names = dbM_dimnames[[2L]] return(j_names[index]) -} - - -# dbDataFrame #### -## rows only #### -#' @rdname hidden_aliases -#' @export -setMethod( - '[', signature(x = 'dbDataFrame', i = 'dbIndex', j = 'missing', drop = 'ANY'), - function(x, i, ..., drop = FALSE) { - x = reconnect(x) - if(any(is.na(x@key))) stopf('Set dbDataFrame key with `keyCol()` to subset on \'i\'') - - # numerics and logical - if(is.logical(i) | is.numeric(i)) { - if(is.logical(i)) i = which(i) - x@data = x@data |> - flex_window_order(x@key) |> - dplyr::mutate(.n = dplyr::row_number()) |> - dplyr::collapse() |> - dplyr::filter(.n %in% i) |> - dplyr::select(-.n) |> - dplyr::collapse() - } else { # character - x@data = x@data |> - flex_window_order(x@key) |> - dplyr::filter(!!as.name(x@key) %in% i) |> - dplyr::collapse() - } - x - }) - -## cols only #### -#' @rdname hidden_aliases -#' @export -setMethod('[', signature(x = 'dbDataFrame', i = 'missing', j = 'dbIndex', drop = 'ANY'), - function(x, j, ..., drop = FALSE) { - x = reconnect(x) - checkmate::assert_logical(drop) - - if(is.logical(j)) j = which(j) - x@data = x@data |> dplyr::select(dplyr::all_of(j)) - x - }) - -#' @rdname hidden_aliases -#' @export -setMethod('[', signature(x = 'dbDataFrame', i = 'dbIndex', j = 'dbIndex', drop = 'ANY'), - function(x, i, j, ..., drop = FALSE) { - x = reconnect(x) - x = x[i,] - x = x[, j] - x - }) - -## Empty #### -### Extract [] #### -#' @rdname hidden_aliases -#' @export -setMethod('[', signature(x = 'dbDataFrame', i = 'missing', j = 'missing', drop = 'missing'), - function(x, i, j) { - x@data - }) - -### Set [] #### -# no initialize to prevent slowdown -#' @rdname hidden_aliases -#' @export -setMethod('[<-', signature(x = 'dbDataFrame', i = 'missing', j = 'missing', value = 'ANY'), - function(x, i, j, value) { - x@data = value - x - }) - -#'@ flex_window_order -#' -#'@param x dbplyr::tbl_lazy -#'@param order_cols character vector of column names -#'@description -#'workaround for multiple dbplyr window column ordering -#' -#' -#'@keywords internal -flex_window_order = function(x, order_cols) { - keys = paste0('!!as.name("', order_cols, '")') - keys = paste0(keys, collapse = ', ') - call_str = paste0('x |> dbplyr::window_order(', keys, ')') - eval(str2lang(call_str)) -} +} \ No newline at end of file diff --git a/R/names.R b/R/names.R index 44a910a..f3894d3 100644 --- a/R/names.R +++ b/R/names.R @@ -1,19 +1,5 @@ # names #### -#' @rdname hidden_aliases -#' @export -setMethod('names', signature(x = 'dbDataFrame'), function(x) { - x = reconnect(x) - colnames(x) -}) - -#' @rdname hidden_aliases -#' @export -setMethod('names<-', signature(x = 'dbDataFrame', value = 'dbIndex'), function(x, value) { - x = reconnect(x) - dplyr_set_colnames(x, value = as.character(value)) -}) - # TODO ensure these match the row / col operations # rownames #### #' @rdname hidden_aliases @@ -61,14 +47,6 @@ setMethod('colnames<-', signature(x = 'dbMatrix'), function(x, value) { x@dim_names[[2]] = value x }) - -#' @rdname hidden_aliases -#' @export -setMethod('colnames<-', signature(x = 'dbDataFrame', value = 'dbIndex'), function(x, value) { - x = reconnect(x) - dplyr_set_colnames(x = x, value = as.character(value)) -}) - # dimnames #### #' @rdname hidden_aliases #' @export @@ -81,17 +59,4 @@ setMethod('dimnames', signature(x = 'dbMatrix'), function(x) { setMethod('dimnames<-', signature(x = 'dbMatrix', value = 'list'), function(x, value) { x@dim_names = value x -}) - -#' @rdname hidden_aliases -#' @export -setMethod('dimnames', signature(x = 'dbDataFrame'), function(x) { - dimnames(x[]) -}) - -#' @rdname hidden_aliases -#' @export -setMethod('dimnames<-', signature(x = 'dbDataFrame', value = 'list'), function(x, value) { - x = dplyr_set_colnames(x, value = as.character(value[[2]])) - x -}) +}) \ No newline at end of file diff --git a/R/operations.R b/R/operations.R index ad31e16..57e9ec7 100644 --- a/R/operations.R +++ b/R/operations.R @@ -614,14 +614,6 @@ setMethod('nrow', signature(x = 'dbMatrix'), function(x) { return(base::nrow(res)) }) -#' @title nrow -#' @rdname hidden_aliases -#' @export -setMethod('nrow', signature(x = 'dbDataFrame'), function(x) { - # x = reconnect(x) - dim(x)[1L] -}) - ### ncol #### #' @title ncol @@ -641,22 +633,6 @@ setMethod('ncol', signature(x = 'dbMatrix'), function(x) { return(base::nrow(res)) }) -#' @title ncol -#' @rdname hidden_aliases -#' @export -setMethod('ncol', signature(x = 'dbDataFrame'), function(x) { - # x = reconnect(x) - ncol(x@data) -}) - -#' @title ncol -#' @rdname hidden_aliases -#' @export -setMethod('ncol', signature(x = 'dbDataFrame'), function(x) { - # x = reconnect(x) - ncol(x@data) -}) - ### dim #### #' @title dim @@ -682,13 +658,6 @@ setMethod('head', signature(x = 'dbMatrix'), function(x, n = 6L, ...) { return(x) }) -#' @title head -#' @export -setMethod('head', signature(x = 'dbDataFrame'), function(x, n = 6L, ...) { - x[] = x[] %in% head(x, n = n) - return(x) -}) - ### tail #### #' @title tail #' @export @@ -699,13 +668,6 @@ setMethod('tail', signature(x = 'dbMatrix'), function(x, n = 6L, ...) { return(x) }) -#' @title tail -#' @export -setMethod('tail', signature(x = 'dbDataFrame'), function(x, n = 6L, ...) { - x[] = x[] %in% tail(x, n = n) - return(x) -}) - # Column data types #### # Due to how these functions will be commonly seen within other functions, a # call to `reconnect()` is omitted. diff --git a/R/simulate.R b/R/simulate.R index 42fea61..7cf0665 100644 --- a/R/simulate.R +++ b/R/simulate.R @@ -139,20 +139,6 @@ sim_ijx_matrix = function(mat_type = NULL, return(ijx) } -#' @describeIn simulate_objects Simulate a dbDataFrame in memory -#' @export -sim_dbDataFrame = function(value = NULL, name = 'df_test', key = NA_character_) { - if(is.null(data)) { - data = sim_duckdb(name = name) - } - if(!inherits(data, 'tbl_sql')) { - checkmate::assert_class(data, 'data.frame') - data = sim_duckdb(data = data, name = name) - } - dbDataFrame(data = data, remote_name = name, hash = 'ID_dummy', - init = TRUE, key = key) -} - #' @describeIn simulate_objects Simulate a dbSparseMatrix in memory #' @description Simulate a dbSparseMatrix in memory #' @export diff --git a/man/dbDataFrame-class.Rd b/man/dbDataFrame-class.Rd deleted file mode 100644 index df58a1c..0000000 --- a/man/dbDataFrame-class.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/classes.R -\docType{class} -\name{dbDataFrame-class} -\alias{dbDataFrame-class} -\alias{dbDataFrame} -\title{S4 dbDataFrame class} -\description{ -Representation of dataframes using an on-disk database. Each object -is used as a connection to a single table that exists within the database. -} -\section{Slots}{ - -\describe{ -\item{\code{data}}{dplyr tbl that represents the database data} - -\item{\code{hash}}{unique hash ID for backend} - -\item{\code{remote_name}}{name of table within database that contains the data} - -\item{\code{key}}{column to set as key for ordering and subsetting on i} -}} - diff --git a/man/flex_window_order.Rd b/man/flex_window_order.Rd deleted file mode 100644 index ec26eac..0000000 --- a/man/flex_window_order.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/extract.R -\name{flex_window_order} -\alias{flex_window_order} -\title{flex_window_order} -\usage{ -flex_window_order(x, order_cols) -} -\arguments{ -\item{x}{dbplyr::tbl_lazy} - -\item{order_cols}{character vector of column names} -} -\description{ -workaround for multiple dbplyr window column ordering -} -\keyword{internal} diff --git a/man/head-dbDataFrame-method.Rd b/man/head-dbDataFrame-method.Rd deleted file mode 100644 index 0335044..0000000 --- a/man/head-dbDataFrame-method.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/operations.R -\name{head,dbDataFrame-method} -\alias{head,dbDataFrame-method} -\title{head} -\usage{ -\S4method{head}{dbDataFrame}(x, n = 6L, ...) -} -\description{ -head -} diff --git a/man/hidden_aliases.Rd b/man/hidden_aliases.Rd index a9bbf5e..02b2b71 100644 --- a/man/hidden_aliases.Rd +++ b/man/hidden_aliases.Rd @@ -6,22 +6,12 @@ \alias{[,dbMatrix,dbIndex,missing,missing-method} \alias{[,dbMatrix,missing,dbIndex,missing-method} \alias{[,dbMatrix,dbIndex,dbIndex,missing-method} -\alias{[,dbDataFrame,dbIndex,missing,ANY-method} -\alias{[,dbDataFrame,missing,dbIndex,ANY-method} -\alias{[,dbDataFrame,dbIndex,dbIndex,ANY-method} -\alias{[,dbDataFrame,missing,missing,missing-method} -\alias{[<-,dbDataFrame,missing,missing,ANY-method} -\alias{names,dbDataFrame-method} -\alias{names<-,dbDataFrame,dbIndex-method} \alias{rownames,dbMatrix-method} \alias{rownames<-,dbMatrix-method} \alias{colnames,dbMatrix-method} \alias{colnames<-,dbMatrix,ANY-method} -\alias{colnames<-,dbDataFrame,dbIndex-method} \alias{dimnames,dbMatrix-method} \alias{dimnames<-,dbMatrix,list-method} -\alias{dimnames,dbDataFrame-method} -\alias{dimnames<-,dbDataFrame,list-method} \alias{Arith,dbMatrix,ANY-method} \alias{Arith,ANY,dbMatrix-method} \alias{Arith,dbMatrix,dbMatrix-method} @@ -44,9 +34,7 @@ \alias{mean,dbSparseMatrix-method} \alias{log,dbMatrix-method} \alias{t,dbMatrix-method} -\alias{nrow,dbDataFrame-method} \alias{ncol,dbMatrix-method} -\alias{ncol,dbDataFrame-method} \alias{dim,dbMatrix-method} \title{rowSums} \usage{ @@ -60,20 +48,6 @@ \S4method{[}{dbMatrix,dbIndex,dbIndex,missing}(x, i, j, ..., drop = TRUE) -\S4method{[}{dbDataFrame,dbIndex,missing,ANY}(x, i, j, ..., drop = TRUE) - -\S4method{[}{dbDataFrame,missing,dbIndex,ANY}(x, i, j, ..., drop = TRUE) - -\S4method{[}{dbDataFrame,dbIndex,dbIndex,ANY}(x, i, j, ..., drop = FALSE) - -\S4method{[}{dbDataFrame,missing,missing,missing}(x, i, j) - -\S4method{[}{dbDataFrame,missing,missing,ANY}(x, i, j) <- value - -\S4method{names}{dbDataFrame}(x) - -\S4method{names}{dbDataFrame,dbIndex}(x) <- value - \S4method{rownames}{dbMatrix}(x) \S4method{rownames}{dbMatrix}(x) @@ -86,16 +60,10 @@ \S4method{colnames}{dbMatrix,ANY}(x) <- value -\S4method{colnames}{dbDataFrame,dbIndex}(x) <- value - \S4method{dimnames}{dbMatrix}(x) \S4method{dimnames}{dbMatrix,list}(x) <- value -\S4method{dimnames}{dbDataFrame}(x) - -\S4method{dimnames}{dbDataFrame,list}(x) <- value - \S4method{Arith}{dbMatrix,ANY}(e1, e2) \S4method{Arith}{ANY,dbMatrix}(e1, e2) @@ -140,14 +108,8 @@ \S4method{t}{dbMatrix}(x) -\S4method{nrow}{dbDataFrame}(x) - \S4method{ncol}{dbMatrix}(x) -\S4method{ncol}{dbDataFrame}(x) - -\S4method{ncol}{dbDataFrame}(x) - \S4method{dim}{dbMatrix}(x) } \description{ @@ -183,12 +145,6 @@ log Transpose -nrow - -ncol - -ncol - ncol dim diff --git a/man/simulate_objects.Rd b/man/simulate_objects.Rd index c513ed6..5da47d8 100644 --- a/man/simulate_objects.Rd +++ b/man/simulate_objects.Rd @@ -5,7 +5,6 @@ \alias{sim_dgc} \alias{sim_denseMat} \alias{sim_ijx_matrix} -\alias{sim_dbDataFrame} \alias{sim_dbSparseMatrix} \alias{sim_dbDenseMatrix} \title{sim_dgc} @@ -18,8 +17,6 @@ sim_denseMat(num_rows = 50, num_cols = 50) sim_ijx_matrix(mat_type = NULL, num_rows = 50, num_cols = 50, seed_num = 42) -sim_dbDataFrame(value = NULL, name = "df_test", key = NA_character_) - sim_dbSparseMatrix( num_rows = 50, num_cols = 50, @@ -72,8 +69,6 @@ a non-zero value. \item \code{sim_ijx_matrix()}: Simulate a duckdb connection dplyr tbl_Pool in memory -\item \code{sim_dbDataFrame()}: Simulate a dbDataFrame in memory - \item \code{sim_dbSparseMatrix()}: Simulate a dbSparseMatrix in memory \item \code{sim_dbDenseMatrix()}: Simulate a dbDenseMatrix in memory diff --git a/man/tail-dbDataFrame-method.Rd b/man/tail-dbDataFrame-method.Rd deleted file mode 100644 index 6cb717c..0000000 --- a/man/tail-dbDataFrame-method.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/operations.R -\name{tail,dbDataFrame-method} -\alias{tail,dbDataFrame-method} -\title{tail} -\usage{ -\S4method{tail}{dbDataFrame}(x, n = 6L, ...) -} -\description{ -tail -} diff --git a/stache_tests/testthat/test-connection.R b/stache_tests/testthat/test-connection.R deleted file mode 100644 index fe586ac..0000000 --- a/stache_tests/testthat/test-connection.R +++ /dev/null @@ -1,43 +0,0 @@ - -test_that('driver extracts working DB driver call', { - dbmat = dbMatrix() - drv_call = driver(dbmat) - expect_true(inherits(drv_call, 'call')) - drv = eval(drv_call) - expect_true(DBI::dbCanConnect(drv)) - duckdb::duckdb_shutdown(drv) -}) - -test_that('remoteValid works', { - dbmat = dbMatrix() - expect_true(remoteValid(dbmat)) - dbmat = disconnect(dbmat) - expect_false(remoteValid(dbmat)) -}) - - - -dbmat = dbMatrix() - -test_that('remoteListTables works', { - expect_identical('test', remoteListTables(dbmat)) - expect_identical('test', remoteListTables(connection(dbmat))) -}) - -test_that('remoteName works', { - rn = remoteName(dbmat) - expect_identical('test', rn) - expect_true(inherits(rn, 'character')) -}) - -test_that('remoteExistsTable works', { - expect_false(remoteExistsTable(dbmat, 'not_there')) - expect_false(remoteExistsTable(connection(dbmat), 'not_there')) - expect_true(remoteExistsTable(dbmat, 'test')) - expect_true(remoteExistsTable(connection(dbmat), 'test')) -}) - - - - -dbmat = disconnect(dbmat) diff --git a/stache_tests/testthat/test-extract.R b/stache_tests/testthat/test-extract.R deleted file mode 100644 index 6b12406..0000000 --- a/stache_tests/testthat/test-extract.R +++ /dev/null @@ -1,12 +0,0 @@ - -test_that('Empty bracket extracts data slot', { - dbmat = new('dbMatrix') - expect_identical(dbmat[], dbmat@data) - dbmat = disconnect(dbmat) -}) - -test_that('Empty bracket extracts data slot', { - dbDF = new('dbDataFrame') - expect_identical(dbDF[], dbDF@data) - # dbDF = disconnect(dbDF) -}) diff --git a/stache_tests/testthat/test-query.R b/stache_tests/testthat/test-query.R deleted file mode 100644 index a79d7d7..0000000 --- a/stache_tests/testthat/test-query.R +++ /dev/null @@ -1,27 +0,0 @@ - -dbmat = new('dbMatrix') -dbDF = new('dbDataFrame') - -test_that('queryStack extracts query info from dbMatrix', { - expect_identical(queryStack(dbmat), dbmat@data$lazy_query) -}) - -test_that('queryStack extracts query info from dbDataFrame', { - expect_identical(queryStack(dbDF), dbDF@data$lazy_query) -}) - - -test_that('queryStack<- replaces query info from dbMatrix', { - dbmat = new('dbMatrix') - queryStack(dbmat) = 'test' - expect_identical(queryStack(dbmat), 'test') -}) - -test_that('queryStack<- replaces query info from dbMatrix', { - dbDF = new('dbDataFrame') - queryStack(dbDF) = 'test' - expect_identical(queryStack(dbDF), 'test') -}) - -dbmat = disconnect(dbmat) -# dbDF = disconnect(dbDF) diff --git a/stache_tests/testthat/test-spatchunk.R b/stache_tests/testthat/test-spatchunk.R deleted file mode 100644 index 2d8c4c3..0000000 --- a/stache_tests/testthat/test-spatchunk.R +++ /dev/null @@ -1,23 +0,0 @@ - -dbpoly = simulate_dbPolygonProxy() - -test_that('spatial chunking selects the correct number of polys', { - ext_list = chunk_plan(extent = ext(dbpoly), min_chunks = 4L) - expected_len = nrow(dbpoly) - - # chunk data - chunk_x_list = lapply( - ext_list, - function(e) { - # 'soft' selections on top and right - extent_filter(x = dbpoly, extent = e, include = c(TRUE, TRUE, FALSE, FALSE), - method = 'mean') - } - ) - - chunk_x_list_len = lapply(chunk_x_list, nrow) - select_len_sum = do.call(sum, chunk_x_list_len) - - expect_equal(select_len_sum, expected_len) -}) - From 5dbd9265f5d7445bcdfcd77d2a992d1fe5da99ce Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:29:38 -0400 Subject: [PATCH 13/27] fix: factor support in vect_multi --- R/operations.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/operations.R b/R/operations.R index 57e9ec7..c290e73 100644 --- a/R/operations.R +++ b/R/operations.R @@ -36,9 +36,9 @@ arith_call_dbm_vect_multi = function(dbm, num_vect, generic_char, ordered_args) # handle dimnames r_names = rownames(dbm) - if (is.factor(r_names)) { - r_names = 1:length(r_names) - } + # if (is.factor(r_names)) { + # r_names = 1:length(r_names) + # } # perform matching of vect by rownames on dbm vect_tbl = dplyr::tibble(i = match(names(num_vect), r_names), From 920aed41f7db61ffb1be6304cb850cc55d27a807 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:29:50 -0400 Subject: [PATCH 14/27] refactor: precompute() --- R/precompute.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/precompute.R b/R/precompute.R index f8e0bd4..06af458 100644 --- a/R/precompute.R +++ b/R/precompute.R @@ -39,7 +39,7 @@ precompute <- function(conn, m, n, name){ } .check_name(name = name) - if(!(is.numeric(m) | is.integer(m)) | !(is.numeric(n) | is.integer(n))){ + if(!(is.numeric(m)) || !(is.numeric(n))){ stop("m and n must be integers or numerics") } From e6e7f92e48820e08a208e741e865c30a19645416 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:35:34 -0700 Subject: [PATCH 15/27] chore: update DESCRIPTION, NAMESPACE --- DESCRIPTION | 5 +++-- NAMESPACE | 11 ++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5a3e306..ba681f2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,9 +15,10 @@ URL: https://drieslab.github.io/dbMatrix/ BugReports: https://github.com/drieslab/dbMatrix/issues RoxygenNote: 7.3.2 Depends: - R (>= 4.1.0), - MatrixGenerics (>= 1.12.3) + R (>= 4.1.0) Imports: + MatrixGenerics (>= 1.12.3), + methods, Matrix, DBI, dplyr, diff --git a/NAMESPACE b/NAMESPACE index 51e68bc..6cf1173 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,9 +3,6 @@ export(dbDenseMatrix) export(dbMatrix) export(dbSparseMatrix) -export(precompute) -export(readMM) -export(read_matrix) export(sim_dbDenseMatrix) export(sim_dbSparseMatrix) exportClasses(dbDenseMatrix) @@ -23,8 +20,6 @@ exportMethods(colSds) exportMethods(colSums) exportMethods(colTypes) exportMethods(colnames) -exportMethods(dbDisconnect) -exportMethods(dbListTables) exportMethods(dim) exportMethods(dimnames) exportMethods(head) @@ -38,4 +33,10 @@ exportMethods(rowSums) exportMethods(rownames) exportMethods(t) exportMethods(tail) +importFrom(MatrixGenerics,colMeans) +importFrom(MatrixGenerics,colSds) +importFrom(MatrixGenerics,colSums) +importFrom(MatrixGenerics,rowMeans) +importFrom(MatrixGenerics,rowSds) +importFrom(MatrixGenerics,rowSums) importFrom(data.table,":=") From 1de3c317f3f6f2485117bb5fbbc2469bacdbfecd Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:35:42 -0700 Subject: [PATCH 16/27] docs: update pkgdown --- _pkgdown.yml | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index f511bc3..05d0f0e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,22 +1,25 @@ url: https://drieslab.github.io/dbMatrix/ template: bootstrap: 5 -navbar: - title: ~ - type: inverse - left: - - text: Home - href: index.html - - text: Articles - menu: - - text: Overview - href: articles/overview.html - - text: Operations - href: articles/operations.html - - text: Arithmetic - href: articles/arithmetic.html - - text: Documentation - href: reference/index.html - - text: Changelog - href: news/index.html + light-switch: true +reference: + - title: "dbMatrix objects" + desc: "Functions for `dbMatrix` objects" + contents: + - has_concept("dbMatrix") + + - title: "Matrix Summary Statistics" + desc: "Functions/methods to compute summary statistics for `dbMatrix` objects" + contents: + - has_concept("summary") + + - title: "Matrix Transformations" + desc: "Functions/methods for transforming `dbMatrix` objects" + contents: + - has_concept("transform") + + - title: "Matrix Properties" + desc: "Functions/methods to get basic properties of `dbMatrix` objects" + contents: + - has_concept("matrix_props") From 3f5713d750333c7f78bcbf4f780ccd582fd0829b Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:36:48 -0700 Subject: [PATCH 17/27] refactor: show() and densification workflow --- R/dbMatrix.R | 730 ++++++++++++++++++--------------------------------- 1 file changed, 256 insertions(+), 474 deletions(-) diff --git a/R/dbMatrix.R b/R/dbMatrix.R index a5346af..0308e40 100644 --- a/R/dbMatrix.R +++ b/R/dbMatrix.R @@ -38,12 +38,15 @@ setMethod( # show #### -## dbDenseMatrix #### -# Method for initializing dbMatrix. Concerns only the processing that is related -# to elements internal to the object. -setMethod('show', signature(object = 'dbDenseMatrix'), function(object) { - cat('connection : ', get_dbdir(object), '\n') - cat('table name : \'', get_tblName(object), '\'\n', sep = '') +## dbDenseMatrix #### +setMethod('show', signature('dbDenseMatrix'), function(object) { + grey_color <- crayon::make_style("grey60") + # cat(grey_color("# Connection:", get_dbdir(object), "\n")) + # + # tbl_name <- dbplyr::remote_name(object[]) + # if(!is.null(tbl_name)){ + # cat(grey_color("# Name: \'", tbl_name, '\'\n', sep = '')) + # } row_names = rownames(object) col_names = colnames(object) @@ -56,7 +59,7 @@ setMethod('show', signature(object = 'dbDenseMatrix'), function(object) { if(identical(dims, c(0L, 0L))) { cat('0 x 0 matrix of class "dbDenseMatrix"\n') - return() # exit early if no info + return() } else { cat(dim_row, 'x', dim_col, ' matrix of class "dbDenseMatrix"\n') } @@ -67,7 +70,7 @@ setMethod('show', signature(object = 'dbDenseMatrix'), function(object) { # print colnames colname_show_n = dim_col - 6L if(colname_show_n < 0L) { - message('Colnames: ', vector_to_string(col_names)) + message('[[ Colnames: ', vector_to_string(col_names), ' ]]') } else if(colname_show_n >= 1L) { message( '[[ Colnames ', @@ -79,107 +82,70 @@ setMethod('show', signature(object = 'dbDenseMatrix'), function(object) { } # get matrix i and j to print - # p_coln = head(col_names, 10L) - if(dim_col > 6) { - p_coln = c(1:3, (dim_col-2):dim_col) + suppress_rows = FALSE # flag for whether rows are being suppressed + if(dim_col - 10L > 0L) { + p_coln = c(head(col_names, 10L)) } else { p_coln = col_names } - + p_coln = head(col_names, 10L) if(dim_row - 6L > 0L) { - # p_rown = c(head(row_names, 3L), tail(row_names, 3L)) - p_rown = c(1:3, (dim_row-2):dim_row) + p_rown = c(head(row_names, 3L), tail(row_names, 3L)) + suppress_rows = TRUE } else { p_rown = row_names - # p_rown = 1:length(row_names) } filter_i = sapply(p_rown, function(f_i) which(f_i == row_names)) filter_j = sapply(p_coln, function(f_j) which(f_j == col_names)) # prepare subset to print - preview_dt = object@value |> + preview_tbl = object@value |> dplyr::filter(i %in% filter_i & j %in% filter_j) |> dplyr::collect() # ij indices for printing - a_i = sapply(preview_dt$i, function(i_idx) which(row_names[i_idx] == p_rown)) - a_j = sapply(preview_dt$j, function(j_idx) which(col_names[j_idx] == p_coln)) + a_i = sapply(preview_tbl$i, function(i_idx) which(row_names[i_idx] == p_rown)) + a_j = sapply(preview_tbl$j, function(j_idx) which(col_names[j_idx] == p_coln)) if (length(a_i) == 0L) a_i = NULL if (length(a_j) == 0L) a_j = NULL - a_x <- NULL - if (length(preview_dt$x) != 0L) { # catch sparse case where if/else: null - a_x <- preview_dt$x - } - if(nrow(preview_dt) > 0) { - preview_dt = data.table::dcast(preview_dt, formula = i ~ j, value.var = 'x') - } else { - print("") # TODO update this for sparse matrix + if (length(preview_tbl$x) != 0L) { # catch sparse case where if/else: null + a_x <- round(preview_tbl$x, digits = 5) } - suppress_rows = FALSE + # print matrix values if(suppress_rows) { + # suppressed lines: capture, split, then print individually + # when suppressed, currently hardcoded to show 3 from head and 3 from tail + a_out = capture.output(print_array(i = a_i, + j = a_j, + x = a_x, + dims = c(length(p_rown), length(p_coln)), + rownames = p_rown)) + writeLines(a_out[1:4]) - output = as.matrix(preview_dt[1:6,2:7]) - rownames(output) = as.matrix(preview_dt[,1]) - - top_left <- output[1:3, 1:3] |> - format(scientific = TRUE, digits = 2) - top_right <- output[1:3, (ncol(output)-2):ncol(output)] |> - format(scientific = TRUE, digits = 2) - bottom_left <- output[(nrow(output)-2):nrow(output), 1:3] |> - format(scientific = TRUE, digits = 2) - bottom_right <- output[(nrow(output)-2):nrow(output), - (ncol(output)-2):ncol(output)] |> - format(scientific = TRUE, digits = 2) - - # Add spacing - pad_names <- function(vector, max_length = 8) { - sapply(vector, function(x) { - padding <- max_length - nchar(x) - left_pad <- floor(padding / 2) - right_pad <- ceiling(padding / 2) - paste0(strrep(" ", left_pad), x, strrep(" ", right_pad)) - }) - } - - # apply proper padding - ellipsis_row <- c(mapply(pad_names, rep('⋮', 3), 8), - mapply(pad_names, "⋮", 2), - mapply(pad_names, rep(' ⋮', 3), 8)) |> crayon::silver() - ellipsis_col_top <- matrix(rep(" … ", 3), ncol = 1) |> crayon::silver() - ellipsis_col_bot <- matrix(rep(" … ", 3), ncol = 1) |> crayon::silver() - - combined <- rbind( - cbind(top_left, ellipsis_col_top, top_right), - ellipsis_row, - cbind(bottom_left, ellipsis_col_bot, bottom_right) - ) + dim_col_out = dim_col - 10L + dim_row_out = dim_row - 6L - # format dim names - rownames(combined) <- crayon::blue( - c(rownames(top_left), - "⋮", - rownames(bottom_left) - ) - ) - - colnames(combined) <- crayon::blue( - c(pad_names(colnames(top_left), 9), - " … ", - pad_names(colnames(top_right), 9)) - ) + if(dim_col_out < 0){ + sprintf('\n......suppressing %d rows\n\n', dim_row_out) |> + cat() + } else if (dim_col_out == 0) { + sprintf('\n...... suppressing %d rows ......\n\n', dim_row_out) |> + cat() + } + else { + sprintf('\n......suppressing %d columns and %d rows\n\n', + dim_col_out, dim_row_out) |> + cat() + } - write.table(combined, - quote = FALSE, - row.names = TRUE, - col.names = NA, - sep = " ", - file = "") + writeLines(a_out[5:7]) } else { + # no suppressed lines: Directly print print_array( i = a_i, j = a_j, @@ -189,14 +155,14 @@ setMethod('show', signature(object = 'dbDenseMatrix'), function(object) { ) } - cat('\n') - }) + ## dbSparseMatrix #### setMethod('show', signature('dbSparseMatrix'), function(object) { - cat('connection : ', get_dbdir(object), '\n') - cat('table name : \'', get_tblName(object), '\'\n', sep = '') + grey_color <- crayon::make_style("grey60") + # cat(grey_color("# Connection:", get_dbdir(object), "\n")) + # cat(grey_color("# Name: \'", get_tblName(object), '\'\n', sep = '')) row_names = rownames(object) col_names = colnames(object) @@ -209,7 +175,7 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { if(identical(dims, c(0L, 0L))) { cat('0 x 0 matrix of class "dbSparseMatrix"\n') - return() # exit early if no info + return() } else { cat(dim_row, 'x', dim_col, ' matrix of class "dbSparseMatrix"\n') } @@ -220,7 +186,7 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { # print colnames colname_show_n = dim_col - 6L if(colname_show_n < 0L) { - message('Colnames: ', vector_to_string(col_names)) + message('[[ Colnames: ', vector_to_string(col_names), ' ]]') } else if(colname_show_n >= 1L) { message( '[[ Colnames ', @@ -260,10 +226,10 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { if (length(a_i) == 0L) a_i = NULL if (length(a_j) == 0L) a_j = NULL - a_x <- NULL + if (length(preview_tbl$x) != 0L) { # catch sparse case where if/else: null - a_x <- preview_tbl$x + a_x <- round(preview_tbl$x, digits = 5) } # print matrix values @@ -277,9 +243,21 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { rownames = p_rown)) writeLines(a_out[1:4]) - sprintf('\n......suppressing %d columns and %d rows\n\n', - dim_col - 10L, dim_row - 6L) |> - cat() + dim_col_out = dim_col - 10L + dim_row_out = dim_row - 6L + + if(dim_col_out < 0){ + sprintf('\n......suppressing %d rows\n\n', dim_row_out) |> + cat() + } else if (dim_col_out == 0) { + sprintf('\n.......... suppressing %d rows ..........\n\n', dim_row_out) |> + cat() + } + else { + sprintf('\n......suppressing %d columns and %d rows\n\n', + dim_col_out, dim_row_out) |> + cat() + } writeLines(a_out[5:7]) } else { @@ -293,28 +271,11 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { ) } - - # if(nrow(preview_dt < 7L)) { - # print(preview_dt, digits = 5L, row.names = 'none') - # } else { - # print(preview_dt[1L:3L,], digits = 5L, row.names = 'none') - # - # sprintf(' ........suppressing %d columns and %d rows\n', - # object@dims[[2L]] - 10L, dim_row - 6L) - # - # print(preview_dt[4L:6L,], digits = 5L, row.names = 'none') - # } - # - # cat('\n') - }) # constructors #### -# Basic function to generate a dbMatrix obj given input data - -#' @title Create a sparse or dense dbMatrix object -#' @name dbMatrix +#' @title Create a sparse or dense dbMatrix objects #' @description #' Create an S4 \code{dbMatrix} object in sparse or dense triplet vector format. #' @param value data to be added to the database. See details for supported data types \code{(required)} @@ -331,28 +292,30 @@ setMethod('show', signature('dbSparseMatrix'), function(object) { #' database. by default, no header is assumed. \code{(optional)} #' @param mtx_colname_col_idx column index of column name file \code{(optional)} #' @param ... additional params to pass -#' @details This function reads in data into a pre-existing DuckDB database. Based -#' on the \code{name} and \code{db_path} a lazy connection is then made -#' downstream during \code{dbMatrix} initialization. -#' +#' @details This function reads in data into a pre-existing DuckDB database. #' Supported \code{value} data types: #' \itemize{ #' \item \code{dgCMatrix} In-memory sparse matrix from the \code{Matrix} package #' \item \code{dgTMatrix} In-memory triplet vector or COO matrix #' \item \code{matrix} In-memory dense matrix from base R -#' \item \code{.mtx} Path to .mtx file (TODO) -#' \item \code{.csv} Path to .csv file (TODO) +#' \item \code{.mtx} Path to .mtx file +#' \item \code{.csv} Path to .csv file #' \item \code{tbl_duckdb_connection} Table in DuckDB database in ijx format from #' existing \code{dbMatrix} object. \code{dims} and \code{dim_names} must be #' specified if \code{value} is \code{tbl_duckdb_connection}. #' } -#' +#' @concept dbMatrix #' @export #' @examples -#' dgc <- dbMatrix:::sim_dgc() -#' dbSparse <- dbMatrix(value = dgc, db_path = ":memory:", -#' name = "sparse_matrix", class = "dbSparseMatrix", -#' overwrite = TRUE) +#' dgc = readRDS(system.file("data", "dgc.rds", package = "dbMatrix")) +#' con <- DBI::dbConnect(duckdb::duckdb(), ":memory:") +#' dbSparse <- dbMatrix( +#' value = dgc, +#' con = con, +#' name = "sparse_matrix", +#' class = "dbSparseMatrix", +#' overwrite = TRUE +#' ) #' dbSparse dbMatrix <- function(value, class = NULL, @@ -379,18 +342,18 @@ dbMatrix <- function(value, # check class if (is.null(class)) { - stop("Invalid class: choose 'dbDenseMatrix' or 'dbSparseMatrix'") + stopf("Invalid class: choose 'dbDenseMatrix' or 'dbSparseMatrix'") } if (!is.character(class) | !(class %in% c("dbDenseMatrix", "dbSparseMatrix"))) { - stop("Invalid class: choose 'dbDenseMatrix' or 'dbSparseMatrix'") + stopf("Invalid class: choose 'dbDenseMatrix' or 'dbSparseMatrix'") } # check value and class mismatch - if(inherits(value, "matrix") & class == "dbSparseMatrix"){ - stop("Class mismatch: set class to 'dbDenseMatrix' for dense matrices") + if((inherits(value, "matrix") | inherits(value, "denseMatrix")) & class == "dbSparseMatrix"){ + stopf("Class mismatch: set class to 'dbDenseMatrix' for dense matrices") } - if(inherits(value, "dgCMatrix") & class == "dbDenseMatrix"){ - stop("Class mismatch: set class to 'dbSparseMatrix' for sparse matrices") + if((inherits(value, "dgCMatrix") | inherits(value, "sparseMatrix")) & class == "dbDenseMatrix"){ + stopf("Class mismatch: set class to 'dbSparseMatrix' for sparse matrices") } # check dims, dim_names @@ -486,13 +449,12 @@ dbMatrix <- function(value, # converters #### #' @title Convert dbSparseMatrix to dbDenseMatrix -#' @name toDbDense #' @description #' Convert a dbSparseMatrix to a dbDenseMatrix. #' #' @param db_sparse dbSparseMatrix object. #' -#' @examples TODO +#' @noRd #' #' @keywords internal toDbDense <- function(db_sparse){ @@ -512,261 +474,95 @@ toDbDense <- function(db_sparse){ db_path <- get_dbdir(db_sparse) # see ?dbMatrix::precompute() for more details - precompute_mat <- getOption("dbMatrix.precomp", default = NULL) - - if(!is.null(precompute_mat)){ + precompute_name <- getOption("dbMatrix.precomp", default = NULL) - # if precompute_mat is not atable in con stop throw error - if(!precompute_mat %in% DBI::dbListTables(con)){ - stop("Precomputed matrix is not a valid table in the connection.") - } + if(!is.null(precompute_name) && (precompute_name %in% DBI::dbListTables(con))){ - precomp <- dplyr::tbl(con, precompute_mat) + precomp <- dplyr::tbl(con, precompute_name) # get the max i value in precomp tbl n_rows_pre <- precomp |> dplyr::summarize(n_rows = max(i)) |> - dplyr::collect() |> dplyr::pull(n_rows) n_cols_pre <- precomp |> dplyr::summarize(n_cols = max(j)) |> - dplyr::collect() |> dplyr::pull(n_cols) # to prevent 1e4 errors and allow >int32 - n_rows_pre = bit64::as.integer64(n_rows_pre) - n_cols_pre = bit64::as.integer64(n_cols_pre) - - if(n_rows_pre < n_rows | n_cols_pre < n_cols){ - message <- glue::glue( - "Precomputed matrix dimensions exceeded. Generate a larger - precomputed matrix with at least {n_rows} rows and {n_cols} columns, - see ?precompute for more details. - - Alternatively, set 'options(dbMatrix.precomp = NULL)' to remove - the precomputed matrix and densify the dbSparseMatrix on the fly - (slow, not recommended). + n_rows_pre <- bit64::as.integer64(n_rows_pre) + n_cols_pre <- bit64::as.integer64(n_cols_pre) + + if(n_rows_pre < n_rows || n_cols_pre < n_cols){ + cli::cli_alert_warning( + "Generating a larger precomputed dbMatrix with {n_rows} rows and {n_cols} columns, + see ?precompute for more details. \n ") - stop(message) + uni_name <- unique_table_name(prefix = "precomp") + precomp <- precompute( + conn = con, + m = n_rows, + n = n_cols, + name = uni_name + ) } - # input validation - # if(!file.exists(precompute_mat)){ - # stop("Precomputed matrix file does not exist. Check for valid file path.") - # } - # con = get_con(db_sparse) - # con2 = DBI::dbConnect(duckdb::duckdb(), precompute_mat) - # - # precompute_mat = dplyr::tbl(con2, precompute_mat_name) - # - # dim_precomp = dim(precompute_mat)[1] * dim(precompute_mat)[2] - # - # dim_sparse = dim(db_sparse)[1] * dim(db_sparse)[2] - # - # if(dim_precomp < dim_sparse){ - # stop(paste0("Precomputed matrix is too small. - # Download or generate larger precomputed matrix with at least ", - # dim(db_sparse)[1], " rows and ", - # dim(db_sparse)[2], " columns.")) - # } - - # attach con2 to con of db_sparse - # db_dense <- dplyr::tbl(db_sparse, precompute_mat) - - # possible errors (and solutions) - # file doesn't exist (check if path is correct, set again with options(...)) - # precomp matrix dimensionality < db_sparse dimensionality (download or generate larger precomp matrix) - # precomp matrix is not a valid dbDenseMatrix object (check if precomp matrix is valid dbDenseMatrix object) - - # algo - # 1. check if precomp matrix exists - # 2. check if precomp matrix is valid dbDenseMatrix object - # 3. check if precomp matrix dim satisfies db_sparse-->db_dense dim - # 4. attach precomp matrix to db_sparse - # 5. return db_sparse - - key <- precomp |> - dplyr::filter(i <= n_rows, j <= n_cols) |> # filter out rows and cols that are not in db_sparse - dplyr::mutate(x = 0) - - data <- key |> - dplyr::left_join(db_sparse[], by = c("i", "j"), suffix = c("", ".dgc")) |> - dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> - dplyr::select(-x.dgc) - - # data |> dplyr::compute(temporary = F) - - # tictoc::tic() - # query <- glue::glue(' - # SELECT - # {precompute_mat}.i, - # {precompute_mat}.j, - # COALESCE({remote_name}.x, {precompute_mat}.x) as x - # FROM - # ( - # SELECT * - # FROM {precompute_mat} - # WHERE i <= {n_rows} AND j <= {n_cols} - # ) as {precompute_mat} - # LEFT JOIN {remote_name} - # ON {precompute_mat}.i = {remote_name}.i - # AND {precompute_mat}.j = {remote_name}.j - # ') - # - # dplyr::tbl(con, dplyr::sql(query)) |> - # dplyr::compute(temporary=F) - # tictoc::toc(); - - # Create new dbSparseMatrix object - db_dense <- new(Class = "dbDenseMatrix", - value = data, - name = remote_name, - dims = dims, - dim_names = dim_names, - init = TRUE) - } else{ # generate dbDenseMatrix from scratch cli::cli_alert_info(paste( - 'Densifying "dbSparseMatrix" on the fly...', - "For large matrices, see ?precompute to speed up densification.", + "Densifying 'dbSparseMatrix' on the fly...", sep = "\n", collapse = "" )) # to prevent 1e4 errors and allow >int32 - n_rows = bit64::as.integer64(n_rows) - n_cols = bit64::as.integer64(n_cols) - - # Generate i and j vectors from scratch - sql_i <- glue::glue("SELECT i FROM generate_series(1, {n_rows}) AS t(i)") - sequence_i <- dplyr::tbl(con, dplyr::sql(sql_i)) - - sql_j <- glue::glue("SELECT j FROM generate_series(1, {n_cols}) AS t(j)") - sequence_j <- dplyr::tbl(con, dplyr::sql(sql_j)) - - key <- sequence_i |> - dplyr::cross_join(sequence_j) |> - dplyr::mutate(x = 0) - - data <- key |> - dplyr::left_join(db_sparse[], by = c("i", "j"), suffix = c("", ".dgc")) |> - dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> - dplyr::select(-x.dgc) - - # Create new dbSparseMatrix object - db_dense <- new(Class = "dbDenseMatrix", - value = data, - name = remote_name, - dims = dims, - dim_names = dim_names, - init = TRUE) + n_rows <- bit64::as.integer64(n_rows) + n_cols <- bit64::as.integer64(n_cols) + + # precompute the matrix + uni_name <- unique_table_name(prefix = "precomp") + precomp <- precompute( + conn = con, + m = n_rows, + n = n_cols, + name = uni_name, + verbose = FALSE + ) } + key <- precomp |> + dplyr::filter(i <= n_rows, j <= n_cols) |> # filter out rows and cols that are not in db_sparse + dplyr::mutate(x = 0) + + data <- key |> + dplyr::left_join(db_sparse[], by = c("i", "j"), suffix = c("", ".dgc")) |> + dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> + dplyr::select(-x.dgc) + + # Create new dbSparseMatrix object + db_dense <- new( + Class = "dbDenseMatrix", + value = data, + name = remote_name, + dims = dims, + dim_names = dim_names, + init = TRUE + ) + # cat("done \n") return(db_dense) } -#' @name toDbSparse #' @description #' Convert a dbDenseMatrix to a dbSparseMatrix on disk using SQL. #' @param db_dense dbDenseMatrix object to convert to dbSparseMatrix #' @noRd #' @keywords internal toDbSparse <- function(db_dense){ - stopf("TODO") - # # check if db_dense is a dbDenseMatrix - # if (!inherits(db_dense, "dbDenseMatrix")) { - # stop("dbDenseMatrix object conversion currently only supported") - # } - # - # # Setup - # con <- cPool(db_dense) - # dims <- db_dense@dims - # remote_name <- db_dense@remote_name - # n_rows <- dims[1] - # n_cols <- dims[2] - # - # # Create a table with all possible combinations of 'i' and 'j' indices - # sql <- paste("CREATE TABLE all_indices AS - # SELECT i.i, j.j - # FROM (SELECT generate_series(1, ?) AS i) AS i - # CROSS JOIN (SELECT generate_series(1, ?) AS j) AS j") - # - # DBI::dbExecute( - # conn = con, - # statement = sql, - # params = list(n_rows, n_cols), - # overwrite = TRUE - # ) - # - # # Create a table with all unique 'i' and 'j' indices from - # # the dbSparseMatrix table - # sql <- paste("CREATE TABLE unique_indices AS - # SELECT DISTINCT i, j - # FROM", remote_name) - # - # DBI::dbExecute( - # conn = con, - # statement = sql) - # - # # Perform a CROSS JOIN between the unique 'i' and 'j' indices to - # # create a new table with the missing combinations - # sql <- paste("CREATE TABLE missing_combinations AS - # SELECT i.i, j.j - # FROM unique_indices AS i - # CROSS JOIN unique_indices AS j - # WHERE NOT EXISTS( - # SELECT 1 - # FROM ", remote_name, " - # WHERE ", paste0(remote_name, ".i"), "=i.i AND", - # paste0(remote_name, ".j"), "= j.j)") - # - # DBI::dbExecute( - # conn = con, - # statement = sql - # ) - # - # # Remove the temporary tables - # DBI::dbRemoveTable(conn = con, name = "all_indices") - # DBI::dbRemoveTable(conn = con, name = "unique_indices") - # - # # Perform a UNION between the dbSparseMatrix table and the new table with missing combinations - # sql <- paste( - # " - # CREATE TABLE staged AS - # SELECT i, j, x FROM", remote_name, " - # UNION ALL - # SELECT i, j, 0 AS x FROM missing_combinations - # " - # ) - # - # DBI::dbExecute(conn = con, - # statement = sql) - # - # # Remove the temporary tables - # DBI::dbRemoveTable(conn = con, name = "missing_combinations") - # - # # Remove old table - # DBI::dbExecute(conn = con, paste0("DROP VIEW IF EXISTS ", remote_name)) - # - # # Rename staged to new remote_name table - # rename_sql <- paste("ALTER TABLE staged RENAME TO", remote_name) - # data <- DBI::dbExecute(conn = con, statement = rename_sql) - # - # # Create new dbSparseMatrix object - # db_sparse <- new("dbSparseMatrix", - # data = db_dense@data, - # hash = db_dense@hash, - # remote_name = remote_name, - # dims = dims, - # dim_names = db_dense@dim_names) - # # show - # db_sparse + stopf("Not yet supported") } -# create ijx vector representation of sparse matrix, keeping zeros +# Create ijx vector representation of sparse matrix, keeping zeros # Updates dgcmatrix by reference # Copied from below: # https://stackoverflow.com/questions/64473488/melting-a-sparse-matrix-dgcmatrix-and-keeping-its-zeros @@ -787,55 +583,56 @@ get_dense_ijx_dt <- function(x) { #' @return remote table in long format unpivoted from wide format matrix #' @keywords internal to_ijx_disk <- function(con, name){ - - # add row idx to ingested matrix - # TODO: do this without creating a new table - query <- glue::glue( - "CREATE TABLE new_table AS SELECT ROW_NUMBER() OVER () AS row_index, * FROM {name};", - "DROP TABLE {name};", - "ALTER TABLE new_table RENAME TO {name};" - ) - invisible(DBI::dbExecute(con, query)) - - # create ijx from wide format - query <- glue::glue("CREATE TABLE ijx AS UNPIVOT {name} ON COLUMNS(* EXCLUDE (row_index));", - "DROP TABLE {name};", - "ALTER TABLE ijx RENAME TO {name};") - invisible(DBI::dbExecute(con, query)) - - # rename column names - query <- glue::glue( - "ALTER TABLE {name} RENAME COLUMN row_index TO i;", - "ALTER TABLE {name} RENAME COLUMN name TO j;", - "ALTER TABLE {name} RENAME COLUMN value TO x;", - ) - invisible(DBI::dbExecute(con, query)) - - # remove char from j column - # TODO: fix the j column data type. still stuck on after below runs - query <- glue::glue("UPDATE {name} SET j = CAST(REPLACE(j, 'V', '') AS DOUBLE);") - invisible(DBI::dbExecute(con, query)) - - res <- dplyr::tbl(con, name) - - return(res) + stopf("Not yet supported") + + # # add row idx to ingested matrix + # # TODO: do this without creating a new table + # query <- glue::glue( + # "CREATE TABLE new_table AS SELECT ROW_NUMBER() OVER () AS row_index, * FROM {name};", + # "DROP TABLE {name};", + # "ALTER TABLE new_table RENAME TO {name};" + # ) + # invisible(DBI::dbExecute(con, query)) + # + # # create ijx from wide format + # query <- glue::glue("CREATE TABLE ijx AS UNPIVOT {name} ON COLUMNS(* EXCLUDE (row_index));", + # "DROP TABLE {name};", + # "ALTER TABLE ijx RENAME TO {name};") + # invisible(DBI::dbExecute(con, query)) + # + # # rename column names + # query <- glue::glue( + # "ALTER TABLE {name} RENAME COLUMN row_index TO i;", + # "ALTER TABLE {name} RENAME COLUMN name TO j;", + # "ALTER TABLE {name} RENAME COLUMN value TO x;", + # ) + # invisible(DBI::dbExecute(con, query)) + # + # # remove char from j column + # # TODO: fix the j column data type. still stuck on after below runs + # query <- glue::glue("UPDATE {name} SET j = CAST(REPLACE(j, 'V', '') AS DOUBLE);") + # invisible(DBI::dbExecute(con, query)) + # + # res <- dplyr::tbl(con, name) + # + # return(res) } #' as_matrix #' #' @param x dbSparseMatrix +#' @param output "matrix" +#' @description +#' Set output to matrix to cast dbSparseMatrix into matrix +#' #' @details #' this is a helper function to convert dbMatrix to dgCMatrix or matrix -#' Warning: this fn can lead memory issue if the dbMatrix is large +#' Warning: can cause memory issues if the input matrix is large #' #' #' @return dgCMatrix or matrix #' @noRd -as_matrix <- function(x){ - # check that x is a dbSparseMatrix - if(!inherits(x = x, what = "dbSparseMatrix")){ - stop("Invalid input. Only dbSparseMatrix is currently supported.") - } +as_matrix <- function(x, output){ con <- dbplyr::remote_con(x[]) .check_con(con) dims <- dim(x) @@ -849,40 +646,75 @@ as_matrix <- function(x){ "Warning: Converting large dbMatrix to in-memory Matrix.") } - max_i <- x[] |> dplyr::summarise(max_i = max(i)) |> dplyr::pull(max_i) - max_j <- x[] |> dplyr::summarise(max_j = max(j)) |> dplyr::pull(max_j) + if(class(x) == "dbSparseMatrix"){ + max_i <- x[] |> dplyr::summarise(max_i = max(i)) |> dplyr::pull(max_i) + max_j <- x[] |> dplyr::summarise(max_j = max(j)) |> dplyr::pull(max_j) - temp_file <- tempfile(tmpdir = getwd(), fileext = ".parquet") + temp_file <- tempfile(tmpdir = getwd(), fileext = ".parquet") - if (max_i == n_rows & max_j == n_cols){ - x[] |> arrow::to_arrow() |> arrow::write_parquet(temp_file) - } else { - # Generate i and j vectors from scratch - sql_i <- glue::glue("SELECT i FROM generate_series(1, {n_rows}) AS t(i)") - sequence_i <- dplyr::tbl(con, dplyr::sql(sql_i)) - - sql_j <- glue::glue("SELECT j FROM generate_series(1, {n_cols}) AS t(j)") - sequence_j <- dplyr::tbl(con, dplyr::sql(sql_j)) + if (max_i == n_rows & max_j == n_cols){ + x[] |> + arrow::to_arrow() |> + arrow::write_parquet(temp_file) + } else { + # Generate i and j vectors from scratch + sql_i <- glue::glue("SELECT i FROM generate_series(1, {n_rows}) AS t(i)") + sequence_i <- dplyr::tbl(con, dplyr::sql(sql_i)) + + sql_j <- glue::glue("SELECT j FROM generate_series(1, {n_cols}) AS t(j)") + sequence_j <- dplyr::tbl(con, dplyr::sql(sql_j)) + + key <- sequence_i |> + dplyr::cross_join(sequence_j) |> + dplyr::mutate(x = 0) + + # TODO: skip arrow conversion and write straight to parquet? + key |> + dplyr::left_join(x[], by = c("i", "j"), suffix = c("", ".dgc")) |> + dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> + dplyr::select(-x.dgc) |> + arrow::to_arrow() |> + arrow::write_parquet(temp_file) + } - key <- sequence_i |> - dplyr::cross_join(sequence_j) |> - dplyr::mutate(x = 0) + # Create mat + dt <- arrow::read_parquet(temp_file) + mat <- Matrix::sparseMatrix(i = dt$i , j = dt$j , x = dt$x, index1 = TRUE) + mat <- Matrix::drop0(mat) + dimnames(mat) = dim_names + dim(mat) = dims + unlink(temp_file, recursive = TRUE, force = TRUE) + if(!missing(output) && output == "matrix"){ + mat <- as.matrix(mat) + } + } + else if(class(x) == "dbDenseMatrix"){ + # Create a temporary file to store the matrix + temp_file <- tempfile(tmpdir = getwd(), fileext = ".parquet") - key |> - dplyr::left_join(x[], by = c("i", "j"), suffix = c("", ".dgc")) |> - dplyr::mutate(x = ifelse(is.na(x.dgc), x, x.dgc)) |> - dplyr::select(-x.dgc) |> + # TODO: skip arrow conversion and write straight to parquet? + x[] |> arrow::to_arrow() |> arrow::write_parquet(temp_file) - } - # Create mat - dt <- arrow::read_parquet(temp_file) - mat <- Matrix::sparseMatrix(i = dt$i , j = dt$j , x = dt$x, index1 = TRUE) - mat <- Matrix::drop0(mat) - dimnames(mat) = dim_names - dim(mat) = dims - unlink(temp_file) + dt <- arrow::read_parquet(temp_file) + + # Create a sparse matrix + mat <- Matrix::sparseMatrix( + i = dt$i, + j = dt$j, + x = dt$x, + index1 = TRUE + ) + dimnames(mat) = dim_names + dim(mat) = dims + + # Convert sparse matrix to dense matrix in-memory + mat <- as.matrix(mat) + + # Clean up temp files + unlink(temp_file, recursive = TRUE, force = TRUE) + } return(mat) } @@ -891,15 +723,15 @@ as_matrix <- function(x){ #' @param x dgCMatrix or matrix #' @noRd as_ijx <- function(x){ - # check that x is a dgCMatrix or matrix - stopifnot(is(x, "dgCMatrix") || is(x, "matrix")) + # check that x is a dgCMatrix, matrix, or dgeMatrix + stopifnot(inherits(x, "dgCMatrix") || inherits(x, "matrix") || inherits(x, "dgeMatrix")) # Convert dgc into TsparseMatrix class from {Matrix} ijx <- as(x, "TsparseMatrix") # Get dbMatrix in triplet vector format (TSparseMatrix) # Convert to 1-based indexing - df = data.frame(i = ijx@i + 1, j = ijx@j + 1, x = ijx@x) + df = data.table::data.table(i = ijx@i + 1, j = ijx@j + 1, x = ijx@x) return(df) } @@ -915,7 +747,7 @@ as_ijx <- function(x){ #' @param name table name to assign within database \code{(required, default: "dbMatrix")} #' @param overwrite whether to overwrite if table already exists in database \code{(required)} #' -#' @return dbMatrix object +#' @return `dbMatrix` object #' @keywords internal dbMatrix_from_tbl <- function(tbl, rownames_colName, @@ -923,7 +755,6 @@ dbMatrix_from_tbl <- function(tbl, name = "dbMatrix", overwrite = FALSE){ # Check args - # TODO: update to proper tbl object check con = dbplyr::remote_con(tbl) .check_con(con) .check_tbl(tbl) @@ -945,13 +776,13 @@ dbMatrix_from_tbl <- function(tbl, if(name %in% DBI::dbListTables(con) & !overwrite){ stop("name already exists in the database. - Please choose a unique name.") + Please choose a unique name or set overwrite to 'TRUE'.") } # check if i and j are column names check_names = intersect(c(colnames(tbl), as.character(rownames_colName), - as.character(colnames_colName)), - c("i", "j")) + as.character(colnames_colName)), + c("i", "j")) if(length(check_names)>0) { stop("i and j are reserved names for matrix dimensions. please choose new column names") @@ -1004,7 +835,7 @@ dbMatrix_from_tbl <- function(tbl, ijx <- count_table |> dplyr::select(i = i_encoded, j = j_encoded, x) |> - dplyr::compute(name = name, overwrite = overwrite) # save to db + dplyr::compute(name = name, overwrite = overwrite, temporary = FALSE) # set metadata dims = c(dim_i, dim_j) @@ -1018,62 +849,11 @@ dbMatrix_from_tbl <- function(tbl, dims = dims) return(res) - - # # Create 'index' with distinct 'genes' and 'id' - # index <- count_table |> - # dplyr::distinct(rownames_colName, colnames_colName) - # - # # map unique integer to each gene and cellID for dbMatrix creation - # # Note: window_order is necessary to have reproducible j assignment - # tbl_i <- index |> - # dplyr::distinct(rownames_colName) |> - # dbplyr::window_order(rownames_colName) |> - # dplyr::mutate(i = dplyr::row_number()) - # - # tbl_j <- index |> - # dplyr::distinct(colnames_colName) |> - # dbplyr::window_order(colnames_colName) |> - # dplyr::mutate(j = dplyr::row_number()) - # - # ijx <- count_table |> - # dplyr::inner_join(tbl_i, by = as.character(rownames_colName)) |> - # dplyr::inner_join(tbl_j, by = as.character(colnames_colName)) |> - # dplyr::select(i, j, x) |> - # dplyr::compute(name = name, overwrite = overwrite) - - #dplyr::compute(temporary = FALSE, name = name, overwrite = TRUE) - - # Note: dim must be less than 2^31 int32 limit - # dim_i = tbl_i |> dplyr::tally() |> dplyr::pull(n) |> as.integer() - # dim_j = tbl_j |> dplyr::tally() |> dplyr::pull(n) |> as.integer() - # dims = c(dim_i, dim_j) - # - # # Note: factor in dbMatrix constructor - # row_names = tbl_i |> dplyr::pull(rownames_colName) - # col_names = tbl_j |> dplyr::pull(colnames_colName) - # dim_names = list(row_names, col_names) - - # Pass ijx to the dbMatrix constructor - # res <- dbMatrix::dbMatrix(value = ijx, - # class = "dbSparseMatrix", - # con = con, - # name = name, - # dims = dims, - # dim_names = dim_names, - # overwrite = overwrite) - # TODO: provide option for dbDenseMatrix - - # res <- new(Class = "dbSparseMatrix", - # value = ijx, - # name = name, - # init = TRUE, - # dim_names = dim_names, - # dims = dims) } # readers #### #' read_matrix -#' @description Read tabular matrix files into database +#' @description Ingest tabular matrix files into database #' @details #' Construct a database VIEW of a .csv, .tsv, or .txt files or their .gz/.gzip #' variants @@ -1086,7 +866,8 @@ dbMatrix_from_tbl <- function(tbl, #' @param ... additional params to pass #' #' @return tbl_dbi object -#' @export +#' @noRd +#' @keywords internal #' #' @examples #' print('TODO') @@ -1150,7 +931,8 @@ read_matrix <- function(con, #' @param ... additional params to pass #' #' @return tbl_dbi object -#' @export +#' @noRd +#' @keywords internal #' #' @examples #' print('TODO') From f12f0094b3dfd4ad43853f4e1d95326602d3f49a Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:37:14 -0700 Subject: [PATCH 18/27] refactor: extracts to hide temp tables in arrow schema --- R/extract.R | 226 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 156 insertions(+), 70 deletions(-) diff --git a/R/extract.R b/R/extract.R index 7ab68a2..ca13bd8 100644 --- a/R/extract.R +++ b/R/extract.R @@ -2,6 +2,7 @@ ## Empty #### ### Extract [] #### #' @rdname hidden_aliases +#' @concept dbMatrix #' @export setMethod('[', signature(x = 'dbMatrix', i = 'missing', j = 'missing', drop = 'missing'), function(x, i, j) { @@ -11,6 +12,7 @@ setMethod('[', signature(x = 'dbMatrix', i = 'missing', j = 'missing', drop = 'm ### Set [] #### # no initialize to prevent slowdown #' @rdname hidden_aliases +#' @concept dbMatrix #' @export setMethod('[<-', signature(x = 'dbMatrix', i = 'missing', j = 'missing', value = 'ANY'), function(x, i, j, value) { @@ -21,28 +23,46 @@ setMethod('[<-', signature(x = 'dbMatrix', i = 'missing', j = 'missing', value = # dbMatrix #### ## rows only #### #' @rdname hidden_aliases +#' @concept dbMatrix #' @export -setMethod('[', signature(x = 'dbMatrix', i = 'dbIndex', j = 'missing', drop = 'missing'), +setMethod('[', + signature( + x = 'dbMatrix', + i = 'dbIndex', + j = 'missing', + drop = 'missing' + ), function(x, i, ...) { # get dbMatrix info con = get_con(x) - tbl_name = get_tblName(x) + dim = dim(x) + + # check inputs + .check_extract(x = x, i = i, j = NULL, dim = dim) # create mapping of filtered rownames to row index - # note: https://duckdb.org/docs/sql/statements/create_sequence.html map = data.frame(i = seq_along(rownames(x)), rowname = rownames(x)) - filter_i = get_dbM_sub_i(index = i, dbM_dimnames = x@dim_names) + filter_i = get_dbM_sub_idx(index = i, + dbM_dimnames = x@dim_names, dims = 1) map = map |> dplyr::filter(rowname %in% filter_i) |> dplyr::mutate(new_i = seq_along(filter_i)) # reset index # send map to db for subsetting - # TODO: implement unique naming of temp tables - map_temp <- dplyr::copy_to(dest = con, - df = map, - name = 'map_temp_i', - overwrite = TRUE, - temporary = TRUE) + name = unique_table_name('temp_i') + + # FIXME: workaround for lack of support for writing + # tables to a custom schema that is invisible to the user + # i.e. not present in DBI::dbListTables(con) + # + # To see the tables in the arrow schema, + # use duckdb::duckdb_list_arrow(conn = con) + map_temp <- arrow::to_duckdb( + .data = map, # converts to arrow-compliant object + con = con, + table_name = name, + auto_disconnect = TRUE # remove tbl when gc + ) # subset dbMatrix x[] <- x[] |> @@ -52,70 +72,87 @@ setMethod('[', signature(x = 'dbMatrix', i = 'dbIndex', j = 'missing', drop = 'm # update dbMatrix attributes x@dim_names[[1L]] = filter_i - x@dims[1L] <- ifelse(is.logical(i), sum(i), length(i)) + x@dims[1L] <- length(filter_i) return(x) }) ## cols only #### #' @rdname hidden_aliases +#' @concept dbMatrix #' @export -setMethod('[', signature(x = 'dbMatrix', i = 'missing', j = 'dbIndex', drop = 'missing'), +setMethod('[', + signature( + x = 'dbMatrix', + i = 'missing', + j = 'dbIndex', + drop = 'missing' + ), function(x, j, ...) { # get dbMatrix info - con = get_con(x) - tbl_name = get_tblName(x) + con <- get_con(x) + dim = dim(x) + + # check for dims + .check_extract(x = x, i = NULL, j = j, dim = dim) # create mapping of filtered colnames to col index - # note: https://duckdb.org/docs/sql/statements/create_sequence.html - map = data.frame(j = seq_along(colnames(x)), colname = colnames(x)) - filter_j = get_dbM_sub_j(index = j, dbM_dimnames = x@dim_names) - map = map |> + map <- data.frame(j = seq_along(colnames(x)), colname = colnames(x)) + filter_j <- get_dbM_sub_idx(index = j, + dbM_dimnames = x@dim_names, dims = 2) + map <- map |> dplyr::filter(colname %in% filter_j) |> dplyr::mutate(new_j = seq_along(filter_j)) # reset index # send map to db for subsetting - # TODO: implement unique table name - duckdb::dbWriteTable(conn = con, - name = 'map_temp_j', - overwrite = TRUE, - value = map, - temporary = TRUE) - map_temp <- dplyr::tbl(con, "map_temp_j") + name <- unique_table_name('temp_j') - # subset dbMatrix + # FIXME: + map_temp <- arrow::to_duckdb( + .data = map, # converts to arrow-compliant object + con = con, + table_name = name, + auto_disconnect = TRUE # remove tbl when gc + ) + + # Subset with arrow virtual table x[] <- x[] |> dplyr::filter(j %in% !!map$j) |> dplyr::inner_join(map_temp, by = c("j" = "j")) |> dplyr::select(i, j = new_j, x) - # update dbMatrix attributes - x@dim_names[[2L]] = filter_j - x@dims[2L] <- ifelse(is.logical(j), sum(j), length(j)) + # Update dbMatrix attributes + x@dim_names[[2L]] <- filter_j + x@dims[2L] <- length(filter_j) return(x) }) ## rows and cols #### #' @rdname hidden_aliases +#' @concept dbMatrix #' @export setMethod('[', signature(x = 'dbMatrix', i = 'dbIndex', j = 'dbIndex', drop = 'missing'), function(x, i, j, ...) { # get dbMatrix info con = get_con(x) - tbl_name = get_tblName(x) + dim = dim(x) + + # check for dims + .check_extract(x = x, i = i, j = j, dim = dim) # create mapping of dim indices and dimnames map_i = data.frame(i = seq_along(rownames(x)), - rowname = rownames(x)) + rowname = rownames(x)) map_j = data.frame(j = seq_along(colnames(x)), colname = colnames(x)) # subset map by filtered dimnames - # note: https://duckdb.org/docs/sql/statements/create_sequence.html - filter_i = get_dbM_sub_i(index = i, dbM_dimnames = x@dim_names) - filter_j = get_dbM_sub_j(index = j, dbM_dimnames = x@dim_names) + filter_i = get_dbM_sub_idx(index = i, + dbM_dimnames = x@dim_names, dims = 1) + filter_j = get_dbM_sub_idx(index = j, + dbM_dimnames = x@dim_names, dims = 2) map_i = map_i |> dplyr::filter(rowname %in% filter_i) |> @@ -125,54 +162,103 @@ setMethod('[', signature(x = 'dbMatrix', i = 'dbIndex', j = 'dbIndex', drop = 'm dplyr::filter(colname %in% filter_j) |> dplyr::mutate(new_j = seq_along(filter_j)) # reset index - # subset dbMatrix j - # TODO: implement unique table name - duckdb::dbWriteTable(conn = con, - name = 'map_temp_ij_i', - overwrite = TRUE, - value = map_j, - temporary = TRUE) - map_temp <- dplyr::tbl(con, "map_temp_ij_i") + name = unique_table_name('temp_map_ij_i') + map_temp_j <- arrow::to_duckdb( + .data = map_j, # converts to arrow-compliant object + con = con, + table_name = name, + auto_disconnect = TRUE # remove tbl when gc + ) - x[] <- x[] |> - dplyr::filter(j %in% !!map_j$j) |> - dplyr::inner_join(map_temp, by = c("j" = "j")) |> - dplyr::select(i, j = new_j, x) - - # subset dbMatrix i - # TODO: implement unique table name - duckdb::dbWriteTable(conn = con, - name = 'map_temp_ij_j', - overwrite = TRUE, - value = map_i, - temporary = TRUE) - - map_temp <- dplyr::tbl(con, "map_temp_ij_j") + name = unique_table_name('temp_map_ij_j') + map_temp_i <- arrow::to_duckdb( + .data = map_i, # converts to arrow-compliant object + con = con, + table_name = name, + auto_disconnect = TRUE # remove tbl when gc + ) x[] <- x[] |> - dplyr::filter(i %in% !!map_i$i) |> - dplyr::inner_join(map_temp, by = c("i" = "i")) |> - dplyr::select(i=new_i, j , x) + dplyr::filter(i %in% !!map_i$i, j %in% !!map_j$j) |> + dplyr::inner_join(map_temp_i, by = c("i" = "i")) |> + dplyr::inner_join(map_temp_j, by = c("j" = "j")) |> + dplyr::select(i = new_i, j = new_j, x) # update dbMatrix attributes x@dim_names[[1L]] = filter_i x@dim_names[[2L]] = filter_j - x@dims[1L] <- ifelse(is.logical(i), sum(i), length(i)) - x@dims[2L] <- ifelse(is.logical(j), sum(j), length(j)) + x@dims[1L] <- length(filter_i) + x@dims[2L] <- length(filter_j) return(x) }) +#' @description +#' Internal function to index `dbMatrix` objects by `dbIndex` superclass. +#' Can apply to both rows (dims = 1) and columns (dims = 2) +#' @keywords internal +#' @noRd +get_dbM_sub_idx = function(index, dbM_dimnames, dims) { + # check that idx is 1 or 2 + if(dims != 1 && dims != 2){ + stop("dims must be 1 (rows) or 2 (columns)") + } + dims = as.integer(dims) + + if(is.character(index)){ + return(index) + } + if (is.logical(index)) { + index <- recycle_boolean_index(index, length(dbM_dimnames[[dims]])) + } + + sub_names = dbM_dimnames[[dims]] + return(sub_names[index]) +} + +#' @details +#' Recycles a logical index to the length of a vector. This is to emulate the +#' strange behavior of what occurs in R when indexing matrices with a logical +#' vector. +#' Note: +#' Recycle indexing behavior is unique to logic indexing, not numerical nor +#' character indexing for matrices. #' @noRd -get_dbM_sub_i = function(index, dbM_dimnames) { - if(is.character(index)) return(index) - i_names = dbM_dimnames[[1L]] - return(i_names[index]) +#' @keywords internal +recycle_boolean_index <- function(index, length) { + if (is.logical(index) && length(index) < length) { + recycled <- rep_len(index, length) + return(which(recycled)) + } + return(index) } #' @noRd -get_dbM_sub_j = function(index, dbM_dimnames) { - if(is.character(index)) return(index) - j_names = dbM_dimnames[[2L]] - return(j_names[index]) -} \ No newline at end of file +#' @keywords internal +.check_extract <- function(x = x, i = NULL, j = NULL, dim){ + if (!is.null(j)) { + if (is.numeric(j) & max(j) > dim[2]) { + stopf("Index exceeds column dimension of", dim[2]) + } + else if(is.character(j) & !all(j %in% colnames(x))) { + missing_cols <- j[!j %in% colnames(x)] + stopf("Column(s) not found in dbMatrix: \n", missing_cols) + } + else if (is.logical(j) & length(j) > dim[2]) { + stopf("Index exceeds column dimension of", dim[2]) + } + } + if (!is.null(i)) { + if (is.numeric(i) & max(i) > dim[1]) { + stopf("Index exceeds row dimension of", dim[1]) + } + else if(is.character(i) & !all(i %in% colnames(x))) { + missing_cols <- i[!i %in% rownames(x)] + stopf("Row(s) not found in dbMatrix: \n", missing_cols) + } + else if (is.logical(i) & length(i) > dim[1]) { + stopf("Index exceeds row dimension of", dim[1]) + } + } + +} From 188d0748d504892172909c98276201674b5f9c33 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:37:55 -0700 Subject: [PATCH 19/27] chore: update generics --- TODO: add imported generics with accompanying roxygen --- R/generics.R | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/R/generics.R b/R/generics.R index e36a6ca..e1f6b4c 100644 --- a/R/generics.R +++ b/R/generics.R @@ -2,20 +2,30 @@ setGeneric('colTypes', function(x, ...) standardGeneric('colTypes')) setGeneric('castNumeric', function(x, col, ...) standardGeneric('castNumeric')) +#' @importFrom MatrixGenerics colMeans colSums rowMeans rowSums colSds rowSds +NULL + +.onLoad <- function(libname, pkgname) { + if (!isGeneric("rownames")) methods::setGeneric("rownames") + if (!isGeneric("rownames<-")) methods::setGeneric("rownames<-") + if (!isGeneric("colnames")) methods::setGeneric("colnames") + if (!isGeneric("colnames<-")) methods::setGeneric("colnames<-") + if (!isGeneric("nrow")) methods::setGeneric("nrow") + if (!isGeneric("ncol")) methods::setGeneric("ncol") +} + # dbMatrix specific #### -setGeneric('colSds', function(x, ...) standardGeneric('colSds')) -setGeneric('colMeans', function(x, ...) standardGeneric('colMeans')) -setGeneric('colSums', function(x, ...) standardGeneric('colSums')) -setGeneric('rowSds', function(x, ...) standardGeneric('rowSds')) -setGeneric('rowMeans', function(x, ...) standardGeneric('rowMeans')) -setGeneric('rowSums', function(x, ...) standardGeneric('rowSums')) +# setGeneric('colSds', function(x, ...) standardGeneric('colSds')) +# setGeneric('colMeans', function(x, ...) standardGeneric('colMeans')) +# setGeneric('colSums', function(x, ...) standardGeneric('colSums')) +# setGeneric('rowSds', function(x, ...) standardGeneric('rowSds')) +# setGeneric('rowMeans', function(x, ...) standardGeneric('rowMeans')) +# setGeneric('rowSums', function(x, ...) standardGeneric('rowSums')) # dbData ops #### -setGeneric('t', function(x, ...) standardGeneric('t')) -setGeneric('mean', function(x, ...) standardGeneric('mean')) +# setGeneric('t', function(x, ...) standardGeneric('t')) +# setGeneric('mean', function(x, ...) standardGeneric('mean')) # DBI #### -setGeneric('dbDisconnect', function(x, ...) standardGeneric('dbDisconnect')) -setGeneric('dbListTables', function(x, ...) standardGeneric('dbListTables')) - - +# setGeneric('dbDisconnect', function(x, ...) standardGeneric('dbDisconnect')) +# setGeneric('dbListTables', function(x, ...) standardGeneric('dbListTables')) From 6290732f10f3fae3a61f6169171dd848e2e6ded1 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:38:58 -0700 Subject: [PATCH 20/27] fix: arith,ops calls calls --- TODO: - see tests for failing arith, ops - need to formalize NaN, div by 0 --- R/operations.R | 354 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 233 insertions(+), 121 deletions(-) diff --git a/R/operations.R b/R/operations.R index c290e73..a57fa4b 100644 --- a/R/operations.R +++ b/R/operations.R @@ -24,8 +24,8 @@ arith_call_dbm = function(dbm_narg, dbm, num_vect, generic_char) { if (length(num_vect) > 1L) return(arith_call_dbm_vect_multi(dbm, num_vect, generic_char, ordered_args)) - build_call = - paste0('dbm[] |> dplyr::mutate(x = `', generic_char, ordered_args) + build_call = paste0('dbm[] |> dplyr::mutate(x = `', + generic_char, ordered_args) dbm[] = eval(str2lang(build_call)) dbm @@ -33,41 +33,75 @@ arith_call_dbm = function(dbm_narg, dbm, num_vect, generic_char) { #' @noRd arith_call_dbm_vect_multi = function(dbm, num_vect, generic_char, ordered_args) { + dim = dim(dbm) + mat = .recycle_vector_to_matrix(num_vect, dim) + vect_tbl = as_ijx(mat) - # handle dimnames - r_names = rownames(dbm) - # if (is.factor(r_names)) { - # r_names = 1:length(r_names) - # } - - # perform matching of vect by rownames on dbm - vect_tbl = dplyr::tibble(i = match(names(num_vect), r_names), - num_vect = unname(num_vect[match(names(num_vect), - r_names)])) + ordered_args <- if (ordered_args == '`(x, num_vect))') { + '`(x.x, x.y))' + } else { + '`(x.y, x.x))' + } - # run dplyr chain - build_call = paste0( + build_call <- glue::glue( 'dbm[] |> ', - 'dplyr::inner_join(vect_tbl, by = \'i\', copy = TRUE) |> ', - 'dplyr::mutate(x = `', - generic_char, - ordered_args, - ' |> ', - 'dplyr::select(i, j, x)' + 'dplyr::full_join(vect_tbl, by = c("i", "j"), copy = TRUE) |> ', + 'dplyr::mutate(', + 'x.x = coalesce(x.x, 0), ', + 'x.y = coalesce(x.y, 0), ', + 'x = `', generic_char, ordered_args,' |> ', + 'dplyr::select(i, j, x) |> ', + 'dplyr::filter(x != 0)' ) + # } + dbm[] = eval(str2lang(build_call)) # show return(dbm) } +#' @noRd +.recycle_vector_to_matrix <- function(vec, dimensions) { + if (length(vec) == 0) { + return(matrix(0, nrow = dimensions[1], ncol = dimensions[2])) + } + + # Recycle the vector to match the total number of elements in the matrix + recycled_vec <- rep(vec, length.out = prod(dimensions)) + + # Create the matrix column-wise + mat <- matrix( + recycled_vec, + nrow = dimensions[1], + ncol = dimensions[2], + byrow = FALSE # This ensures column-wise filling + ) + + # Throw warning if length of vec is not a multiple of dimensions[2] + # if (length(vec) %% dimensions[2] != 0) { + # warning('longer object length is not a multiple of shorter object length', + # call. = FALSE) + # } + + return(mat) +} + # Math Ops #### ## Arith: dbm_e2 #### -#' @rdname hidden_aliases +#' Arith dbMatrix, e2 +#' @description +#' See ?\link{\code{methods::Arith}} for more details. +#' @noRd +#' @rdname summary #' @export setMethod('Arith', signature(e1 = 'dbMatrix', e2 = 'ANY'), function(e1, e2) { + if (any(e2 == 0) && as.character(.Generic) %in% c('/', '^', '%%', '%/%')) { + stopf("Arith operations with '/', '^', '%%', '%/%' containing zero values are not yet supported for dbMatrix objects.") + } + dbm = castNumeric(e1) num_vect = if(typeof(e2) != 'double'){ @@ -91,9 +125,17 @@ setMethod('Arith', signature(e1 = 'dbMatrix', e2 = 'ANY'), function(e1, e2) { }) ## Arith: e1_dbm #### -#' @rdname hidden_aliases +#' Arith e1, dbMatrix +#' @description +#' See ?\link{\code{methods::Arith}} for more details. +#' @noRd +#' @rdname summary #' @export -setMethod('Arith', signature(e1 = 'ANY', e2 = 'dbMatrix'), function(e1, e2) { +setMethod('Arith', signature(e1 = 'ANY', e2 = 'dbMatrix'), + function(e1, e2) { + if (any(e1 == 0) && as.character(.Generic) %in% c('/', '^', '%%', '%/%')) { + stopf("Arith operations with '/', '^', '%%', '%/%' containing zero values are not yet supported for dbMatrix objects.") + } dbm = castNumeric(e2) num_vect = if (typeof(e1) != 'double'){ @@ -103,7 +145,7 @@ setMethod('Arith', signature(e1 = 'ANY', e2 = 'dbMatrix'), function(e1, e2) { } # Only densify if not 0 and if op is + or - - if (class(dbm) == 'dbSparseMatrix' && e1 != 0 && as.character(.Generic) %in% c('-', '+')) { + if (class(dbm) == 'dbSparseMatrix' && any(e1 != 0) && as.character(.Generic) %in% c('-', '+')) { dbm = toDbDense(dbm) } @@ -116,76 +158,92 @@ setMethod('Arith', signature(e1 = 'ANY', e2 = 'dbMatrix'), function(e1, e2) { }) ## Arith: dbm_dbm #### -#' @rdname hidden_aliases +#' Arith dbMatrix, e2 +#' @description +#' See ?\link{\code{methods::Arith}} for more details. +#' @noRd +#' @rdname summary #' @export -setMethod('Arith', signature(e1 = 'dbMatrix', e2 = 'dbMatrix'), function(e1, e2) -{ - if (!identical(e1@dims, e2@dims)) +setMethod('Arith', signature(e1 = 'dbMatrix', e2 = 'dbMatrix'), + function(e1, e2){ + if (!identical(e1@dims, e2@dims)) { stopf('non-conformable arrays') + } + generic_char = as.character(.Generic) + + if(generic_char %in% c('-','/', '^', '%%', '%/%')){ + stopf("Arith operations with '-', '/', '^', '%%', '%/%' are not yet supported between dbMatrix objects.") + } e1 = castNumeric(e1) e2 = castNumeric(e2) - build_call = str2lang( - paste0( - "e1[] |> - dplyr::left_join(e2[], by = c('i', 'j'), suffix = c('', '.y'), copy = TRUE) |> - dplyr::mutate(x = `", - as.character(.Generic), - "`(x, x.y)) |> - dplyr::select(c('i', 'j', 'x'))" - ) + build_call = glue::glue( + "e1[] |> + dplyr::left_join(e2[], by = c('i', 'j')) |> + dplyr::mutate(x = `{generic_char}`(x.x, x.y)) |> + dplyr::select(i, j, x)" ) - e1[] = eval(build_call) + e1[] = eval(str2lang(build_call)) e1 }) ## Ops: dbm_e2 #### -#' @rdname hidden_aliases +#' Ops dbMatrix, e2 +#' @description +#' See ?\link{\code{methods::Ops}} for more details. +#' @noRd +#' @rdname summary #' @export setMethod('Ops', signature(e1 = 'dbMatrix', e2 = 'ANY'), function(e1, e2) { - # e1 = reconnect(e1) + browser() - build_call = str2lang(paste0( + build_call = glue::glue( 'e1[] |> dplyr::mutate(x = `', as.character(.Generic) , '`(x, e2))' - )) - e1[] = eval(build_call) + ) + e1[] = eval(str2lang(build_call)) e1 }) ## Ops: e1_dbm #### -#' @rdname hidden_aliases +#' Ops e1, dbMatrix +#' @description +#' See ?\link{\code{methods::Ops}} for more details. +#' @noRd +#' @rdname summary #' @export setMethod('Ops', signature(e1 = 'ANY', e2 = 'dbMatrix'), function(e1, e2) { # e2 = reconnect(e2) - build_call = str2lang(paste0( + build_call = glue::glue( 'e2[] |> dplyr::mutate(x = `', as.character(.Generic) , '`(e1, x))' - )) - e2[] = eval(build_call) + ) + e2[] = eval(str2lang(build_call)) e2 }) ## Ops: dbm_dbm #### -#' @rdname hidden_aliases +#' Ops dbMatrix, dbMatrix +#' @description +#' See ?\link{\code{methods::Ops}} for more details. +#' @noRd +#' @rdname summary #' @export -setMethod('Ops', signature(e1 = 'dbMatrix', e2 = 'dbMatrix'), function(e1, e2) -{ +setMethod('Ops', signature(e1 = 'dbMatrix', e2 = 'dbMatrix'), function(e1, e2) { if (!identical(e1@dims, e2@dims)){ stopf('non-conformable arrays') } - build_call = str2lang( - paste0( + build_call = glue::glue( "e1[] |> dplyr::left_join(e2[], by = c('i', 'j'), suffix = c('', '.y')) |> dplyr::mutate(x = `", @@ -193,17 +251,17 @@ setMethod('Ops', signature(e1 = 'dbMatrix', e2 = 'dbMatrix'), function(e1, e2) "`(x, x.y)) |> dplyr::select(c('i', 'j', 'x'))" ) - ) - e1[] = eval(build_call) - # print(e1[]) + e1[] = eval(str2lang(build_call)) e1 }) # Math Summary Ops #### -## rowSums #### - -#' @title rowSums -#' @rdname hidden_aliases +## rowSums dbdm #### +#' Form Row and Column Sums and Means +#' @description +#' See ?\link{\code{base::rowSums}} for more details. +#' @concept summary +#' @rdname row_col_sums_means #' @export setMethod('rowSums', signature(x = 'dbDenseMatrix'), function(x, ...){ @@ -227,8 +285,12 @@ setMethod('rowSums', signature(x = 'dbDenseMatrix'), } ) -#' @title rowSums -#' @rdname hidden_aliases +## rowSums dbsm #### +#' Form Row and Column Sums and Means +#' @description +#' See ?\link{\code{base::rowSums}} for more details. +#' @concept summary +#' @rdname row_col_sums_means #' @export setMethod('rowSums', signature(x = 'dbSparseMatrix'), function(x, ...){ @@ -269,10 +331,13 @@ setMethod('rowSums', signature(x = 'dbSparseMatrix'), }) -## colSums #### +## colSums dbdm#### -#' @title colSums -#' @rdname hidden_aliases +#' Form Row and Column Sums and Means +#' @description +#' See ?\link{\code{base::colSums}} for more details. +#' @concept summary +#' @rdname row_col_sums_means #' @export setMethod('colSums', signature(x = 'dbDenseMatrix'), function(x, ...){ @@ -295,8 +360,12 @@ setMethod('colSums', signature(x = 'dbDenseMatrix'), vals }) -#' @title colSums -#' @rdname hidden_aliases +## colSums dbsm #### +#' Form Row and Column Sums and Means +#' @description +#' See ?\link{\code{base::colSums}} for more details. +#' @concept summary +#' @rdname row_col_sums_means #' @export setMethod('colSums', signature(x = 'dbSparseMatrix'), function(x, ...){ @@ -336,12 +405,13 @@ setMethod('colSums', signature(x = 'dbSparseMatrix'), res }) +## rowMeans dbdm #### - -## rowMeans #### - -#' @title rowMeans -#' @rdname hidden_aliases +#' Form Row and Column Sums and Means +#' @description +#' See ?\link{\code{base::rowMeans}} for more details. +#' @concept summary +#' @rdname row_col_sums_means #' @export setMethod('rowMeans', signature(x = 'dbDenseMatrix'), function(x, ...){ @@ -364,8 +434,12 @@ setMethod('rowMeans', signature(x = 'dbDenseMatrix'), vals }) -#' @title rowMeans -#' @rdname hidden_aliases +## rowMeans dbsm #### +#' Form Row and Column Sums and Means +#' @description +#' See ?\link{\code{base::rowMeans}} for more details. +#' @concept summary +#' @rdname row_col_sums_means #' @export setMethod('rowMeans', signature(x = 'dbSparseMatrix'), function(x, ...){ @@ -390,10 +464,12 @@ setMethod('rowMeans', signature(x = 'dbSparseMatrix'), vals }) -## colMeans #### - -#' @title colMeans -#' @rdname hidden_aliases +## colMeans dbdm#### +#' Form Row and Column Sums and Means +#' @description +#' See ?\link{\code{base::colMeans}} for more details. +#' @concept summary +#' @rdname row_col_sums_means #' @export setMethod('colMeans', signature(x = 'dbDenseMatrix'), function(x, ...){ @@ -413,8 +489,12 @@ setMethod('colMeans', signature(x = 'dbDenseMatrix'), vals }) -#' @title colMeans -#' @rdname hidden_aliases +## colMeans dbsm #### +#' Form Row and Column Sums and Means +#' @description +#' See ?\link{\code{base::colMeans}} for more details. +#' @concept summary +#' @rdname row_col_sums_means #' @export setMethod('colMeans', signature(x = 'dbSparseMatrix'), function(x, ...){ @@ -439,10 +519,13 @@ setMethod('colMeans', signature(x = 'dbSparseMatrix'), vals }) -## colSds #### +## colSds dbdm #### -#' @title colSds -#' @rdname hidden_aliases +#' Calculates the standard deviation for each row (column) of a matrix-like object +#' @description +#' See ?\link{\code{MatrixGenerics::colSds}} for more details. +#' @concept summary +#' @rdname sds #' @export setMethod('colSds', signature(x = 'dbDenseMatrix'), function(x, ...){ @@ -460,8 +543,12 @@ setMethod('colSds', signature(x = 'dbDenseMatrix'), vals }) -#' @title colSds -#' @rdname hidden_aliases +## colSds dbsm #### +#' Calculates the standard deviation for each row (column) of a matrix-like object +#' @description +#' See ?\link{\code{MatrixGenerics::colSds}} for more details. +#' @concept summary +#' @rdname sds #' @export setMethod('colSds', signature(x = 'dbSparseMatrix'), function(x, ...){ @@ -486,9 +573,12 @@ setMethod('colSds', signature(x = 'dbSparseMatrix'), # vals }) -## rowSds #### -#' @title rowSds -#' @rdname hidden_aliases +## rowSds dbdm#### +#' Calculates the standard deviation for each row (column) of a matrix-like object +#' @description +#' See ?\link{\code{MatrixGenerics::rowSds}} for more details. +#' @concept summary +#' @rdname sds #' @export setMethod('rowSds', signature(x = 'dbDenseMatrix'), function(x, ...){ @@ -506,8 +596,12 @@ setMethod('rowSds', signature(x = 'dbDenseMatrix'), vals }) -#' @title rowSds -#' @rdname hidden_aliases +## rowSds dbsm #### +#' Calculates the standard deviation for each row (column) of a matrix-like object +#' @description +#' See ?\link{\code{MatrixGenerics::rowSds}} for more details. +#' @concept summary +#' @rdname sds #' @export setMethod('rowSds', signature(x = 'dbSparseMatrix'), function(x, ...){ @@ -527,10 +621,13 @@ setMethod('rowSds', signature(x = 'dbSparseMatrix'), # vals }) -## mean #### +## mean dbdm#### -#' @title mean -#' @rdname hidden_aliases +#' Arithmetic Mean +#' @description +#' See ?\link{\code{base::mean}} for more details. +#' @concept summary +#' @rdname mean #' @export setMethod('mean', signature(x = 'dbDenseMatrix'), function(x, ...) { x = castNumeric(x) @@ -543,8 +640,12 @@ setMethod('mean', signature(x = 'dbDenseMatrix'), function(x, ...) { }) -#' @title mean -#' @rdname hidden_aliases +## mean dbsm#### +#' Arithmetic Mean +#' @description +#' See ?\link{\code{base::mean}} for more details. +#' @concept summary +#' @rdname mean #' @export setMethod('mean', signature(x = 'dbSparseMatrix'), function(x, ...) { x = castNumeric(x) @@ -564,8 +665,10 @@ setMethod('mean', signature(x = 'dbSparseMatrix'), function(x, ...) { ## log #### -#' @title log -#' @rdname hidden_aliases +#' Logarithms and Exponentials +#' @description +#' See ?\link{\code{base::log}} for more details. +#' @concept transform #' @export setMethod('log', signature(x = 'dbMatrix'), function(x, ...) { x = castNumeric(x) @@ -581,8 +684,10 @@ setMethod('log', signature(x = 'dbMatrix'), function(x, ...) { ### t #### -#' @title Transpose -#' @rdname hidden_aliases +#' Matrix Transpose +#' @description +#' See ?\link{\code{base::t}} for more details. +#' @concept transform #' @export setMethod('t', signature(x = 'dbMatrix'), function(x) { x[] = x[] |> dplyr::select(i = j, j = i, x) @@ -593,12 +698,11 @@ setMethod('t', signature(x = 'dbMatrix'), function(x) { ### nrow #### -#' @name nrow -#' @title The number of rows/cols +#' The Number of Rows/Columns of an Array #' @description -#' \code{nrow} and \code{ncol} return the number of rows or columns present in -#' \code{x}. -#' @aliases ncol +#' See ?\link{\code{base::nrow}} for more details. +#' @concept matrix_props +#' @rdname nrow_ncol #' @export setMethod('nrow', signature(x = 'dbMatrix'), function(x) { # x = reconnect(x) @@ -616,8 +720,11 @@ setMethod('nrow', signature(x = 'dbMatrix'), function(x) { ### ncol #### -#' @title ncol -#' @rdname hidden_aliases +#' The Number of Rows/Columns of an Array +#' @description +#' See ?\link{\code{base::ncol}} for more details. +#' @concept matrix_props +#' @rdname nrow_ncol #' @export setMethod('ncol', signature(x = 'dbMatrix'), function(x) { # x = reconnect(x) @@ -635,8 +742,10 @@ setMethod('ncol', signature(x = 'dbMatrix'), function(x) { ### dim #### -#' @title dim -#' @rdname hidden_aliases +#' Dimensions of an Object +#' @description +#' See ?\link{\code{base::dim}} for more details. +#' @concept matrix_props #' @export setMethod('dim', signature(x = 'dbMatrix'), @@ -649,7 +758,11 @@ setMethod('dim', }) ### head #### -#' @title head +#' Return the First or Last Parts of an Object +#' @description +#' See ?\link{\code{utils::head}} for more details. +#' @concept matrix_props +#' @rdname head_tail #' @export setMethod('head', signature(x = 'dbMatrix'), function(x, n = 6L, ...) { n_subset = 1:n @@ -659,7 +772,11 @@ setMethod('head', signature(x = 'dbMatrix'), function(x, n = 6L, ...) { }) ### tail #### -#' @title tail +#' Return the First or Last Parts of an Object +#' @description +#' See ?\link{\code{utils::tail}} for more details. +#' @concept matrix_props +#' @rdname head_tail #' @export setMethod('tail', signature(x = 'dbMatrix'), function(x, n = 6L, ...) { n_subset = (x@dims[1L] - n):x@dims[1L] @@ -669,25 +786,19 @@ setMethod('tail', signature(x = 'dbMatrix'), function(x, n = 6L, ...) { }) # Column data types #### -# Due to how these functions will be commonly seen within other functions, a -# call to `reconnect()` is omitted. ## colTypes #### -#' @name colTypes -#' @title Column data types of dbData objects -#' @description -#' Get the column data types of objects that inherit from \code{'dbData'} -#' @param x dbData data object -#' @param ... additional params to pass +#' Return the column types of a dbMatrix object +#' @concept matrix_props +#' @rdname colTypes #' @export -setMethod('colTypes', signature(x = 'dbData'), function(x, ...) { +setMethod('colTypes', signature(x = 'dbMatrix'), function(x, ...) { vapply(data.table::as.data.table(head(slot(x, "value"), 1L)), typeof, character(1L)) }) ## castNumeric #### -#' @name castNumeric #' @title Set a column to numeric #' @description #' Sets a column to numeric after first checking the column data type. Does @@ -696,9 +807,10 @@ setMethod('colTypes', signature(x = 'dbData'), function(x, ...) { #' @param x dbData data object #' @param col column to cast to numeric #' @param ... additional params to pass -#' @export +#' @noRd +#' @keywords internal setMethod('castNumeric', - signature(x = 'dbData', col = 'character'), + signature(x = 'dbMatrix', col = 'character'), function(x, col, ...) { if (colTypes(x)[col] != 'double') { sym_col = dplyr::sym(col) From fba7d1d63694fe592a78d873ea8cc7ec5096c9c1 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:40:20 -0700 Subject: [PATCH 21/27] chore: update docs --- R/accessors.R | 2 +- R/classes.R | 20 ++--- R/names.R | 30 ++++--- man/DBI.Rd | 16 ---- man/castNumeric.Rd | 23 ------ man/colTypes.Rd | 16 ++-- man/dbDenseMatrix-class.Rd | 17 ---- man/dbMatrix.Rd | 53 ++++--------- man/dbMatrix_from_tbl.Rd | 2 +- man/dbSparseMatrix-class.Rd | 11 --- man/dim-dbMatrix-method.Rd | 12 +++ man/get_tblName.Rd | 2 +- man/head-dbMatrix-method.Rd | 11 --- man/head_tail.Rd | 17 ++++ man/hidden_aliases.Rd | 151 ------------------------------------ man/log-dbMatrix-method.Rd | 12 +++ man/mean.Rd | 17 ++++ man/nrow.Rd | 13 ---- man/nrow_ncol.Rd | 17 ++++ man/precompute.Rd | 9 ++- man/readMM.Rd | 42 ---------- man/read_matrix.Rd | 34 -------- man/row_col_sums_means.Rd | 47 +++++++++++ man/sds.Rd | 59 ++++++++++++++ man/t-dbMatrix-method.Rd | 12 +++ man/tail-dbMatrix-method.Rd | 11 --- man/toDbDense.Rd | 19 ----- 27 files changed, 256 insertions(+), 419 deletions(-) delete mode 100644 man/DBI.Rd delete mode 100644 man/castNumeric.Rd delete mode 100644 man/dbDenseMatrix-class.Rd delete mode 100644 man/dbSparseMatrix-class.Rd create mode 100644 man/dim-dbMatrix-method.Rd delete mode 100644 man/head-dbMatrix-method.Rd create mode 100644 man/head_tail.Rd delete mode 100644 man/hidden_aliases.Rd create mode 100644 man/log-dbMatrix-method.Rd create mode 100644 man/mean.Rd delete mode 100644 man/nrow.Rd create mode 100644 man/nrow_ncol.Rd delete mode 100644 man/readMM.Rd delete mode 100644 man/read_matrix.Rd create mode 100644 man/row_col_sums_means.Rd create mode 100644 man/sds.Rd create mode 100644 man/t-dbMatrix-method.Rd delete mode 100644 man/tail-dbMatrix-method.Rd delete mode 100644 man/toDbDense.Rd diff --git a/R/accessors.R b/R/accessors.R index 2a825f7..eff59a9 100644 --- a/R/accessors.R +++ b/R/accessors.R @@ -1,7 +1,7 @@ # internal functions #### #' get_tblName #' -#' @param dbData +#' @param dbMatrix #' #' @keywords internal get_tblName <- function(dbMatrix){ diff --git a/R/classes.R b/R/classes.R index 219fe47..9e6dd4e 100644 --- a/R/classes.R +++ b/R/classes.R @@ -1,6 +1,5 @@ # dbData #### -#' @name dbData #' @title dbData #' @description Base class for all db objects #' @slot value dplyr tbl that represents the database data @@ -24,12 +23,14 @@ setClass( ### dbMatrix #### -#' @title S4 dbMatrix class +#' @title S4 virtual class for `dbMatrix` #' @description -#' Representation of sparse matrices using an on-disk database. Each object +#' Representation of sparse and dense matrices in a database. Each object #' is used as a connection to a single table that exists within the database. +#' Inherits from `dbData`. #' @slot dim_names row [1] and col [2] names #' @slot dims dimensions of the matrix +#' @noRd #' @export dbMatrix = setClass( Class = 'dbMatrix', @@ -45,12 +46,12 @@ dbMatrix = setClass( ) #### dbDenseMatrix #### -#' @title S4 Class for dbDenseMatrix +#' @title S4 Class for `dbDenseMatrix` #' #' @description Representation of dense matrices using an on-disk database. -#' Inherits from dbMatrix. +#' Inherits from \link{dbMatrix}. #' -#' @slot data A dense ijx dataframe/tibble +#' @noRd #' @export dbDenseMatrix = setClass( Class = "dbDenseMatrix", @@ -61,8 +62,8 @@ dbDenseMatrix = setClass( #' @title S4 Class for dbSparseMatrix #' #' @description Representation of sparse matrices using an on-disk database. -#' Inherits from dbMatrix. -#' +#' Inherits from \link{dbMatrix.} +#' @noRd #' @export dbSparseMatrix = setClass( Class = "dbSparseMatrix", @@ -70,8 +71,7 @@ dbSparseMatrix = setClass( ) ## dbIndex #### -#' @title Virtual Class "dbIndex" - Simple Class for dbData indices -#' @name dbIndex +#' @title S4 virtual class - Simple Class for dbData indices #' @description #' This is a virtual class used for indices (in signatures) for indexing #' and sub-assignment of 'dbData' objects. Simple class union of 'logical', diff --git a/R/names.R b/R/names.R index f3894d3..8d1a506 100644 --- a/R/names.R +++ b/R/names.R @@ -2,21 +2,28 @@ # TODO ensure these match the row / col operations # rownames #### -#' @rdname hidden_aliases +#' @rdname matrix_props +#' @concept matrix_props #' @export setMethod('rownames', signature(x = 'dbMatrix'), function(x) { rownames(x@value) }) -#' @rdname hidden_aliases +#' @rdname matrix_props +#' @concept matrix_props #' @export setMethod('rownames', signature(x = 'dbMatrix'), function(x) { x@dim_names[[1]] }) -#' @rdname hidden_aliases +#' @rdname matrix_props +#' @concept matrix_props #' @export setMethod('rownames<-', signature(x = 'dbMatrix'), function(x, value) { + if(is.null(value)){ + stopf('rownames are required for dbMatrix objects') + } + if(x@dims[1] != length(value)){ stopf('length of rownames to set does not equal number of rows') } @@ -25,19 +32,22 @@ setMethod('rownames<-', signature(x = 'dbMatrix'), function(x, value) { }) # colnames #### -#' @rdname hidden_aliases +#' @rdname matrix_props +#' @concept matrix_props #' @export setMethod('colnames', signature(x = 'dbMatrix'), function(x) { colnames(x@value) }) -#' @rdname hidden_aliases +#' @rdname matrix_props +#' @concept matrix_props #' @export setMethod('colnames', signature(x = 'dbMatrix'), function(x) { x@dim_names[[2]] }) -#' @rdname hidden_aliases +#' @rdname matrix_props +#' @concept matrix_props #' @export setMethod('colnames<-', signature(x = 'dbMatrix'), function(x, value) { if(x@dims[2] != length(value)){ @@ -48,15 +58,17 @@ setMethod('colnames<-', signature(x = 'dbMatrix'), function(x, value) { x }) # dimnames #### -#' @rdname hidden_aliases +#' @rdname matrix_props +#' @concept matrix_props #' @export setMethod('dimnames', signature(x = 'dbMatrix'), function(x) { x@dim_names }) -#' @rdname hidden_aliases +#' @rdname matrix_props +#' @concept matrix_props #' @export setMethod('dimnames<-', signature(x = 'dbMatrix', value = 'list'), function(x, value) { x@dim_names = value x -}) \ No newline at end of file +}) diff --git a/man/DBI.Rd b/man/DBI.Rd deleted file mode 100644 index d090f12..0000000 --- a/man/DBI.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{dbDisconnect,dbMatrix-method} -\alias{dbDisconnect,dbMatrix-method} -\alias{dbListTables,dbMatrix-method} -\title{dbDisconnect} -\usage{ -\S4method{dbDisconnect}{dbMatrix}(x, ...) - -\S4method{dbListTables}{dbMatrix}(x, ...) -} -\description{ -dbDisconnect - -dbListTables -} diff --git a/man/castNumeric.Rd b/man/castNumeric.Rd deleted file mode 100644 index db1457a..0000000 --- a/man/castNumeric.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/operations.R -\name{castNumeric} -\alias{castNumeric} -\alias{castNumeric,dbMatrix,missing-method} -\title{Set a column to numeric} -\usage{ -\S4method{castNumeric}{dbData,character}(x, col, ...) - -\S4method{castNumeric}{dbMatrix,missing}(x, col, ...) -} -\arguments{ -\item{x}{dbData data object} - -\item{col}{column to cast to numeric} - -\item{...}{additional params to pass} -} -\description{ -Sets a column to numeric after first checking the column data type. Does -nothing if the column is already a \code{double} -This precaution is to avoid truncation of values. -} diff --git a/man/colTypes.Rd b/man/colTypes.Rd index 52a15ae..c4ea7c4 100644 --- a/man/colTypes.Rd +++ b/man/colTypes.Rd @@ -1,16 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/operations.R -\name{colTypes} -\alias{colTypes} -\title{Column data types of dbData objects} +\name{colTypes,dbMatrix-method} +\alias{colTypes,dbMatrix-method} +\title{Return the column types of a `dbMatrix` object} \usage{ -\S4method{colTypes}{dbData}(x, ...) -} -\arguments{ -\item{x}{dbData data object} - -\item{...}{additional params to pass} +\S4method{colTypes}{dbMatrix}(x, ...) } \description{ -Get the column data types of objects that inherit from \code{'dbData'} +Return the column types of a `dbMatrix` object } +\concept{matrix_props} diff --git a/man/dbDenseMatrix-class.Rd b/man/dbDenseMatrix-class.Rd deleted file mode 100644 index 356cb03..0000000 --- a/man/dbDenseMatrix-class.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/classes.R -\docType{class} -\name{dbDenseMatrix-class} -\alias{dbDenseMatrix-class} -\alias{dbDenseMatrix} -\title{S4 Class for dbDenseMatrix} -\description{ -Representation of dense matrices using an on-disk database. -Inherits from dbMatrix. -} -\section{Slots}{ - -\describe{ -\item{\code{data}}{A dense ijx dataframe/tibble} -}} - diff --git a/man/dbMatrix.Rd b/man/dbMatrix.Rd index 2b03de7..b2fbb9b 100644 --- a/man/dbMatrix.Rd +++ b/man/dbMatrix.Rd @@ -1,24 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/classes.R, R/dbMatrix.R +% Please edit documentation in R/dbMatrix.R \name{dbMatrix} \alias{dbMatrix} -\title{S4 dbMatrix class} +\title{Create a sparse or dense dbMatrix objects} \usage{ -dbMatrix( - value, - class = NULL, - con = NULL, - overwrite = FALSE, - name = "dbMatrix", - dims = NULL, - dim_names = NULL, - mtx_rowname_file_path, - mtx_rowname_col_idx = 1, - mtx_colname_file_path, - mtx_colname_col_idx = 1, - ... -) - dbMatrix( value, class = NULL, @@ -62,40 +47,32 @@ database. by default, no header is assumed. \code{(optional)}} \item{...}{additional params to pass} } \description{ -Representation of sparse matrices using an on-disk database. Each object -is used as a connection to a single table that exists within the database. - Create an S4 \code{dbMatrix} object in sparse or dense triplet vector format. } \details{ -This function reads in data into a pre-existing DuckDB database. Based -on the \code{name} and \code{db_path} a lazy connection is then made -downstream during \code{dbMatrix} initialization. - +This function reads in data into a pre-existing DuckDB database. Supported \code{value} data types: \itemize{ \item \code{dgCMatrix} In-memory sparse matrix from the \code{Matrix} package \item \code{dgTMatrix} In-memory triplet vector or COO matrix \item \code{matrix} In-memory dense matrix from base R - \item \code{.mtx} Path to .mtx file (TODO) - \item \code{.csv} Path to .csv file (TODO) + \item \code{.mtx} Path to .mtx file + \item \code{.csv} Path to .csv file \item \code{tbl_duckdb_connection} Table in DuckDB database in ijx format from existing \code{dbMatrix} object. \code{dims} and \code{dim_names} must be specified if \code{value} is \code{tbl_duckdb_connection}. } } -\section{Slots}{ - -\describe{ -\item{\code{dim_names}}{row [1] and col [2] names} - -\item{\code{dims}}{dimensions of the matrix} -}} - \examples{ -dgc <- dbMatrix:::sim_dgc() -dbSparse <- dbMatrix(value = dgc, db_path = ":memory:", - name = "sparse_matrix", class = "dbSparseMatrix", - overwrite = TRUE) +dgc = readRDS(system.file("data", "dgc.rds", package = "dbMatrix")) +con <- DBI::dbConnect(duckdb::duckdb(), ":memory:") +dbSparse <- dbMatrix( + value = dgc, + con = con, + name = "sparse_matrix", + class = "dbSparseMatrix", + overwrite = TRUE +) dbSparse } +\concept{dbMatrix} diff --git a/man/dbMatrix_from_tbl.Rd b/man/dbMatrix_from_tbl.Rd index fb004fc..81b9ef6 100644 --- a/man/dbMatrix_from_tbl.Rd +++ b/man/dbMatrix_from_tbl.Rd @@ -26,7 +26,7 @@ dbMatrix_from_tbl( \item{con}{DBI or duckdb connection object \code{(required)}} } \value{ -dbMatrix object +`dbMatrix` object } \description{ Construcst a \code{dbSparseMatrix} object from a \code{tbl_duckdb_connection} object. diff --git a/man/dbSparseMatrix-class.Rd b/man/dbSparseMatrix-class.Rd deleted file mode 100644 index 9b32534..0000000 --- a/man/dbSparseMatrix-class.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/classes.R -\docType{class} -\name{dbSparseMatrix-class} -\alias{dbSparseMatrix-class} -\alias{dbSparseMatrix} -\title{S4 Class for dbSparseMatrix} -\description{ -Representation of sparse matrices using an on-disk database. -Inherits from dbMatrix. -} diff --git a/man/dim-dbMatrix-method.Rd b/man/dim-dbMatrix-method.Rd new file mode 100644 index 0000000..ce658fe --- /dev/null +++ b/man/dim-dbMatrix-method.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/operations.R +\name{dim,dbMatrix-method} +\alias{dim,dbMatrix-method} +\title{Dimensions of an Object} +\usage{ +\S4method{dim}{dbMatrix}(x) +} +\description{ +See ?\link{\code{base::dim}} for more details. +} +\concept{matrix_props} diff --git a/man/get_tblName.Rd b/man/get_tblName.Rd index ca43b7c..0b86885 100644 --- a/man/get_tblName.Rd +++ b/man/get_tblName.Rd @@ -7,7 +7,7 @@ get_tblName(dbMatrix) } \arguments{ -\item{dbData}{} +\item{dbMatrix}{} } \description{ get_tblName diff --git a/man/head-dbMatrix-method.Rd b/man/head-dbMatrix-method.Rd deleted file mode 100644 index a9607b3..0000000 --- a/man/head-dbMatrix-method.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/operations.R -\name{head,dbMatrix-method} -\alias{head,dbMatrix-method} -\title{head} -\usage{ -\S4method{head}{dbMatrix}(x, n = 6L, ...) -} -\description{ -head -} diff --git a/man/head_tail.Rd b/man/head_tail.Rd new file mode 100644 index 0000000..8526253 --- /dev/null +++ b/man/head_tail.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/operations.R +\name{head,dbMatrix-method} +\alias{head,dbMatrix-method} +\alias{tail,dbMatrix-method} +\title{Return the First or Last Parts of an Object} +\usage{ +\S4method{head}{dbMatrix}(x, n = 6L, ...) + +\S4method{tail}{dbMatrix}(x, n = 6L, ...) +} +\description{ +See ?\link{\code{utils::head}} for more details. + +See ?\link{\code{utils::tail}} for more details. +} +\concept{matrix_props} diff --git a/man/hidden_aliases.Rd b/man/hidden_aliases.Rd deleted file mode 100644 index 02b2b71..0000000 --- a/man/hidden_aliases.Rd +++ /dev/null @@ -1,151 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/extract.R, R/names.R, R/operations.R -\name{[,dbMatrix,missing,missing,missing-method} -\alias{[,dbMatrix,missing,missing,missing-method} -\alias{[<-,dbMatrix,missing,missing,ANY-method} -\alias{[,dbMatrix,dbIndex,missing,missing-method} -\alias{[,dbMatrix,missing,dbIndex,missing-method} -\alias{[,dbMatrix,dbIndex,dbIndex,missing-method} -\alias{rownames,dbMatrix-method} -\alias{rownames<-,dbMatrix-method} -\alias{colnames,dbMatrix-method} -\alias{colnames<-,dbMatrix,ANY-method} -\alias{dimnames,dbMatrix-method} -\alias{dimnames<-,dbMatrix,list-method} -\alias{Arith,dbMatrix,ANY-method} -\alias{Arith,ANY,dbMatrix-method} -\alias{Arith,dbMatrix,dbMatrix-method} -\alias{Ops,dbMatrix,ANY-method} -\alias{Ops,ANY,dbMatrix-method} -\alias{Ops,dbMatrix,dbMatrix-method} -\alias{rowSums,dbDenseMatrix-method} -\alias{rowSums,dbSparseMatrix-method} -\alias{colSums,dbDenseMatrix-method} -\alias{colSums,dbSparseMatrix-method} -\alias{rowMeans,dbDenseMatrix-method} -\alias{rowMeans,dbSparseMatrix-method} -\alias{colMeans,dbDenseMatrix-method} -\alias{colMeans,dbSparseMatrix-method} -\alias{colSds,dbDenseMatrix-method} -\alias{colSds,dbSparseMatrix-method} -\alias{rowSds,dbDenseMatrix-method} -\alias{rowSds,dbSparseMatrix-method} -\alias{mean,dbDenseMatrix-method} -\alias{mean,dbSparseMatrix-method} -\alias{log,dbMatrix-method} -\alias{t,dbMatrix-method} -\alias{ncol,dbMatrix-method} -\alias{dim,dbMatrix-method} -\title{rowSums} -\usage{ -\S4method{[}{dbMatrix,missing,missing,missing}(x, i, j) - -\S4method{[}{dbMatrix,missing,missing,ANY}(x, i, j) <- value - -\S4method{[}{dbMatrix,dbIndex,missing,missing}(x, i, j, ..., drop = TRUE) - -\S4method{[}{dbMatrix,missing,dbIndex,missing}(x, i, j, ..., drop = TRUE) - -\S4method{[}{dbMatrix,dbIndex,dbIndex,missing}(x, i, j, ..., drop = TRUE) - -\S4method{rownames}{dbMatrix}(x) - -\S4method{rownames}{dbMatrix}(x) - -\S4method{rownames}{dbMatrix}(x) <- value - -\S4method{colnames}{dbMatrix}(x) - -\S4method{colnames}{dbMatrix}(x) - -\S4method{colnames}{dbMatrix,ANY}(x) <- value - -\S4method{dimnames}{dbMatrix}(x) - -\S4method{dimnames}{dbMatrix,list}(x) <- value - -\S4method{Arith}{dbMatrix,ANY}(e1, e2) - -\S4method{Arith}{ANY,dbMatrix}(e1, e2) - -\S4method{Arith}{dbMatrix,dbMatrix}(e1, e2) - -\S4method{Ops}{dbMatrix,ANY}(e1, e2) - -\S4method{Ops}{ANY,dbMatrix}(e1, e2) - -\S4method{Ops}{dbMatrix,dbMatrix}(e1, e2) - -\S4method{rowSums}{dbDenseMatrix}(x, ...) - -\S4method{rowSums}{dbSparseMatrix}(x, ...) - -\S4method{colSums}{dbDenseMatrix}(x, ...) - -\S4method{colSums}{dbSparseMatrix}(x, ...) - -\S4method{rowMeans}{dbDenseMatrix}(x, ...) - -\S4method{rowMeans}{dbSparseMatrix}(x, ...) - -\S4method{colMeans}{dbDenseMatrix}(x, ...) - -\S4method{colMeans}{dbSparseMatrix}(x, ...) - -\S4method{colSds}{dbDenseMatrix}(x, ...) - -\S4method{colSds}{dbSparseMatrix}(x, ...) - -\S4method{rowSds}{dbDenseMatrix}(x, ...) - -\S4method{rowSds}{dbSparseMatrix}(x, ...) - -\S4method{mean}{dbDenseMatrix}(x, ...) - -\S4method{mean}{dbSparseMatrix}(x, ...) - -\S4method{log}{dbMatrix}(x, ...) - -\S4method{t}{dbMatrix}(x) - -\S4method{ncol}{dbMatrix}(x) - -\S4method{dim}{dbMatrix}(x) -} -\description{ -rowSums - -rowSums - -colSums - -colSums - -rowMeans - -rowMeans - -colMeans - -colMeans - -colSds - -colSds - -rowSds - -rowSds - -mean - -mean - -log - -Transpose - -ncol - -dim -} diff --git a/man/log-dbMatrix-method.Rd b/man/log-dbMatrix-method.Rd new file mode 100644 index 0000000..a7857a3 --- /dev/null +++ b/man/log-dbMatrix-method.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/operations.R +\name{log,dbMatrix-method} +\alias{log,dbMatrix-method} +\title{Logarithms and Exponentials} +\usage{ +\S4method{log}{dbMatrix}(x, ...) +} +\description{ +See ?\link{\code{base::log}} for more details. +} +\concept{transform} diff --git a/man/mean.Rd b/man/mean.Rd new file mode 100644 index 0000000..9e967b2 --- /dev/null +++ b/man/mean.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/operations.R +\name{mean,dbDenseMatrix-method} +\alias{mean,dbDenseMatrix-method} +\alias{mean,dbSparseMatrix-method} +\title{Arithmetic Mean} +\usage{ +\S4method{mean}{dbDenseMatrix}(x, ...) + +\S4method{mean}{dbSparseMatrix}(x, ...) +} +\description{ +See ?\link{\code{base::mean}} for more details. + +See ?\link{\code{base::mean}} for more details. +} +\concept{summary} diff --git a/man/nrow.Rd b/man/nrow.Rd deleted file mode 100644 index 151d0f2..0000000 --- a/man/nrow.Rd +++ /dev/null @@ -1,13 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/operations.R -\name{nrow} -\alias{nrow} -\alias{ncol} -\title{The number of rows/cols} -\usage{ -\S4method{nrow}{dbMatrix}(x) -} -\description{ -\code{nrow} and \code{ncol} return the number of rows or columns present in -\code{x}. -} diff --git a/man/nrow_ncol.Rd b/man/nrow_ncol.Rd new file mode 100644 index 0000000..d376b70 --- /dev/null +++ b/man/nrow_ncol.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/operations.R +\name{nrow,dbMatrix-method} +\alias{nrow,dbMatrix-method} +\alias{ncol,dbMatrix-method} +\title{The Number of Rows/Columns of an Array} +\usage{ +\S4method{nrow}{dbMatrix}(x) + +\S4method{ncol}{dbMatrix}(x) +} +\description{ +See ?\link{\code{base::nrow}} for more details. + +See ?\link{\code{base::ncol}} for more details. +} +\concept{matrix_props} diff --git a/man/precompute.Rd b/man/precompute.Rd index fbd3b65..886fd9c 100644 --- a/man/precompute.Rd +++ b/man/precompute.Rd @@ -4,7 +4,7 @@ \alias{precompute} \title{Precompute a dbMatrix table in a database} \usage{ -precompute(conn, m, n, name) +precompute(conn, m, n, name, verbose = TRUE) } \arguments{ \item{conn}{duckdb database connection} @@ -14,6 +14,11 @@ precompute(conn, m, n, name) \item{n}{number of columns of precomputed dbMatrix table} \item{name}{name of the precomputed dbMatrix table to be created} + +\item{verbose}{logical, print progress messages. default: TRUE.} +} +\value{ +tbl_dbi } \description{ Precomputes a dbMatrix table in a specificied database connection. @@ -36,3 +41,5 @@ table set \code{options(dbMatrix.precomp = NULL)} in the R console. con = DBI::dbConnect(duckdb::duckdb(), ":memory:") precompute(con = con , m = 100, n = 100, name = "precomputed_table") } +\concept{dbMatrix} +\keyword{internal} diff --git a/man/readMM.Rd b/man/readMM.Rd deleted file mode 100644 index 77299ba..0000000 --- a/man/readMM.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dbMatrix.R -\name{readMM} -\alias{readMM} -\title{read_MM} -\usage{ -readMM(con, value, name = "dbMatrix", overwrite = FALSE, ...) -} -\arguments{ -\item{con}{DBI or duckdb connection object \code{(required)}} - -\item{value}{path to .mtx or .mtx.gz file \code{(required)}} - -\item{name}{name to assign file within database \code{(optional)}. -default: "dbMatrix"} - -\item{overwrite}{whether to overwrite if `name` already exists in database. -\code{(required)}. default: FALSE} - -\item{...}{additional params to pass} -} -\value{ -tbl_dbi object -} -\description{ -Read matrix market file (.mtx or .mtx.gz) into database -} -\details{ -Construct a database VIEW of a .mtx or .mtx.gz file with columns 'i', 'j', -and 'x' representing the row index, column index, and value of the matrix, -respectively. - -By default 'i' and 'j' are of type BIGINT and 'x' is of type DOUBLE. -Note: lack of support in R for BIGINT may cause errors when pulling data -into memory without proper type conversion. - -By default, .mtx files are expected to contain two lines representing the -standard header information. -} -\examples{ -print('TODO') -} diff --git a/man/read_matrix.Rd b/man/read_matrix.Rd deleted file mode 100644 index 1ec34c7..0000000 --- a/man/read_matrix.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dbMatrix.R -\name{read_matrix} -\alias{read_matrix} -\title{read_matrix} -\usage{ -read_matrix(con, value, name = "dbMatrix", overwrite = FALSE, ...) -} -\arguments{ -\item{con}{DBI or duckdb connection object \code{(required)}} - -\item{value}{path to .txt, .csv, .tsv or .gzip/.gz variants \code{(required)}} - -\item{name}{name to assign file within database \code{(optional)}. -default: "dbMatrix"} - -\item{overwrite}{whether to overwrite if `name` already exists in database. -\code{(required)}. default: FALSE} - -\item{...}{additional params to pass} -} -\value{ -tbl_dbi object -} -\description{ -Read tabular matrix files into database -} -\details{ -Construct a database VIEW of a .csv, .tsv, or .txt files or their .gz/.gzip -variants -} -\examples{ -print('TODO') -} diff --git a/man/row_col_sums_means.Rd b/man/row_col_sums_means.Rd new file mode 100644 index 0000000..f37081c --- /dev/null +++ b/man/row_col_sums_means.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/operations.R +\name{rowSums,dbDenseMatrix-method} +\alias{rowSums,dbDenseMatrix-method} +\alias{rowSums,dbSparseMatrix-method} +\alias{colSums,dbDenseMatrix-method} +\alias{colSums,dbSparseMatrix-method} +\alias{rowMeans,dbDenseMatrix-method} +\alias{rowMeans,dbSparseMatrix-method} +\alias{colMeans,dbDenseMatrix-method} +\alias{colMeans,dbSparseMatrix-method} +\title{Form Row and Column Sums and Means} +\usage{ +\S4method{rowSums}{dbDenseMatrix}(x, na.rm = FALSE, dims = 1, ...) + +\S4method{rowSums}{dbSparseMatrix}(x, na.rm = FALSE, dims = 1, ...) + +\S4method{colSums}{dbDenseMatrix}(x, na.rm = FALSE, dims = 1, ...) + +\S4method{colSums}{dbSparseMatrix}(x, na.rm = FALSE, dims = 1, ...) + +\S4method{rowMeans}{dbDenseMatrix}(x, na.rm = FALSE, dims = 1, ...) + +\S4method{rowMeans}{dbSparseMatrix}(x, na.rm = FALSE, dims = 1, ...) + +\S4method{colMeans}{dbDenseMatrix}(x, na.rm = FALSE, dims = 1, ...) + +\S4method{colMeans}{dbSparseMatrix}(x, na.rm = FALSE, dims = 1, ...) +} +\description{ +See ?\link{\code{base::rowSums}} for more details. + +See ?\link{\code{base::rowSums}} for more details. + +See ?\link{\code{base::colSums}} for more details. + +See ?\link{\code{base::colSums}} for more details. + +See ?\link{\code{base::rowMeans}} for more details. + +See ?\link{\code{base::rowMeans}} for more details. + +See ?\link{\code{base::colMeans}} for more details. + +See ?\link{\code{base::colMeans}} for more details. +} +\concept{summary} diff --git a/man/sds.Rd b/man/sds.Rd new file mode 100644 index 0000000..8ac68fb --- /dev/null +++ b/man/sds.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/operations.R +\name{colSds,dbDenseMatrix-method} +\alias{colSds,dbDenseMatrix-method} +\alias{colSds,dbSparseMatrix-method} +\alias{rowSds,dbDenseMatrix-method} +\alias{rowSds,dbSparseMatrix-method} +\title{Calculates the standard deviation for each row (column) of a matrix-like object} +\usage{ +\S4method{colSds}{dbDenseMatrix}( + x, + rows = NULL, + cols = NULL, + na.rm = FALSE, + center = NULL, + ..., + useNames = TRUE +) + +\S4method{colSds}{dbSparseMatrix}( + x, + rows = NULL, + cols = NULL, + na.rm = FALSE, + center = NULL, + ..., + useNames = TRUE +) + +\S4method{rowSds}{dbDenseMatrix}( + x, + rows = NULL, + cols = NULL, + na.rm = FALSE, + center = NULL, + ..., + useNames = TRUE +) + +\S4method{rowSds}{dbSparseMatrix}( + x, + rows = NULL, + cols = NULL, + na.rm = FALSE, + center = NULL, + ..., + useNames = TRUE +) +} +\description{ +See ?\link{\code{MatrixGenerics::colSds}} for more details. + +See ?\link{\code{MatrixGenerics::colSds}} for more details. + +See ?\link{\code{MatrixGenerics::rowSds}} for more details. + +See ?\link{\code{MatrixGenerics::rowSds}} for more details. +} +\concept{summary} diff --git a/man/t-dbMatrix-method.Rd b/man/t-dbMatrix-method.Rd new file mode 100644 index 0000000..1058194 --- /dev/null +++ b/man/t-dbMatrix-method.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/operations.R +\name{t,dbMatrix-method} +\alias{t,dbMatrix-method} +\title{Matrix Transpose} +\usage{ +\S4method{t}{dbMatrix}(x) +} +\description{ +See ?\link{\code{base::t}} for more details. +} +\concept{transform} diff --git a/man/tail-dbMatrix-method.Rd b/man/tail-dbMatrix-method.Rd deleted file mode 100644 index d80515b..0000000 --- a/man/tail-dbMatrix-method.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/operations.R -\name{tail,dbMatrix-method} -\alias{tail,dbMatrix-method} -\title{tail} -\usage{ -\S4method{tail}{dbMatrix}(x, n = 6L, ...) -} -\description{ -tail -} diff --git a/man/toDbDense.Rd b/man/toDbDense.Rd deleted file mode 100644 index 4ae7801..0000000 --- a/man/toDbDense.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dbMatrix.R -\name{toDbDense} -\alias{toDbDense} -\title{Convert dbSparseMatrix to dbDenseMatrix} -\usage{ -toDbDense(db_sparse) -} -\arguments{ -\item{db_sparse}{dbSparseMatrix object.} -} -\description{ -Convert a dbSparseMatrix to a dbDenseMatrix. -} -\examples{ -TODO - -} -\keyword{internal} From ca5d68d1e4d5afacac2551bfbc6beed13106a40c Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:41:10 -0700 Subject: [PATCH 22/27] fix: `.check_value` checks for invalid file path --- R/input_validation.R | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/R/input_validation.R b/R/input_validation.R index 81bbcab..92bffcf 100644 --- a/R/input_validation.R +++ b/R/input_validation.R @@ -20,16 +20,18 @@ #' @param x A \link{Matrix}, \link{matrix}, or \link{tbl_duckdb_connection} object #' @keywords internal .check_value <- function(value){ - is_mat = inherits(value, 'Matrix') | inherits(value, 'matrix') - is_duckdb = inherits(value, 'tbl_duckdb_connection') - is_null = is.null(value) if(is.character(value)){ - is_valid_file = file.exists(value) - } else { - is_valid_file = FALSE + if(!file.exists(value)){ + stopf( + 'File does not exist. Please provide a valid file path.' + ) + } + return(invisible(NULL)) } + is_valid <- inherits(value, + c('Matrix', 'matrix', 'tbl_duckdb_connection')) || is.null(value) - if(!(is_mat | is_duckdb | is_null | is_valid_file)) { + if(!(is_valid)) { stopf( 'Invalid "value" input passed.' ) From ac923884415b4cbd691dbd3f7c06a1f7cae96189 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:41:34 -0700 Subject: [PATCH 23/27] refactor: `precompute()` --- R/precompute.R | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/R/precompute.R b/R/precompute.R index 06af458..9e8746a 100644 --- a/R/precompute.R +++ b/R/precompute.R @@ -4,6 +4,7 @@ #' @param m number of rows of precomputed dbMatrix table #' @param n number of columns of precomputed dbMatrix table #' @param name name of the precomputed dbMatrix table to be created +#' @param verbose logical, print progress messages. default: TRUE. #' @description #' Precomputes a dbMatrix table in a specificied database connection. #' This can speed up operations that involve breaking @@ -21,22 +22,18 @@ #' \code{n_rows} and \code{num_cols}, or to manually remove the precomputed #' table set \code{options(dbMatrix.precomp = NULL)} in the R console. #' -#' @return NULL -#' @export -#' +#' @return tbl_dbi +#' @keywords internal +#' @concept dbMatrix #' @examples #' con = DBI::dbConnect(duckdb::duckdb(), ":memory:") #' precompute(con = con , m = 100, n = 100, name = "precomputed_table") -precompute <- function(conn, m, n, name){ +precompute <- function(conn, m, n, name, verbose = TRUE){ + # create a random sufix with precomp_ as suffix if name is NULL using + # dbplyr internal random name generator + # input validation .check_con(conn = conn) - if(name %in% DBI::dbListTables(conn)){ - options(dbMatrix.precomp = name) - str <- glue::glue("Precomputed dbMatrix '{name}' with - {m} rows and {n} columns") - cat(str, "\n") - return() - } .check_name(name = name) if(!(is.numeric(m)) || !(is.numeric(n))){ @@ -61,7 +58,11 @@ precompute <- function(conn, m, n, name){ # set global variable for precomputed matrix options(dbMatrix.precomp = name) - str <- glue::glue("Precomputed dbMatrix '{name}' with + if(verbose){ + str <- glue::glue("Precomputed tbl '{name}' with {n_rows} rows and {n_cols} columns") - cat(str, "\n") + cat(str, "\n") + } + + return(key) } From bd853b5f0690bfdae944ed9e964a5e9dd3580968 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:41:56 -0700 Subject: [PATCH 24/27] refactor: `dbDisconnect()` for dbMatrix --- R/utils.R | 57 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/R/utils.R b/R/utils.R index 64decad..d5da43f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -5,7 +5,6 @@ NULL # Print Formatting #### #' @title Wrap message -#' @name wrap_msg #' @param ... additional strings and/or elements to pass to wrap_txt #' @param sep how to join elements of string (default is one space) #' @keywords internal @@ -15,7 +14,6 @@ wrap_msg = function(..., sep = ' ') { } #' @title Wrap text -#' @name wrap_txt #' @param ... additional params to pass #' @param sep how to join elements of string (default is one space) #' @param strWidth externally set wrapping width. (default value of 100 is not effected) @@ -39,7 +37,7 @@ wrap_txt = function(..., sep = ' ', strWidth = 100, errWidth = FALSE) { # Custom stop function stopf = function(...) { - wrap_txt('dbMatrix:', ..., errWidth = TRUE) |> + wrap_txt('dbMatrix:\n', ..., errWidth = TRUE) |> stop(call. = FALSE) } @@ -52,7 +50,6 @@ vector_to_string = function(x) { } #' @title Generate array for pretty printing of matrix values -#' @name print_array #' @param i,j,x matched vectors of integers in i and j, with value in x #' @param dims dimensions of the array (integer vector of 2) #' @param fill fill character @@ -105,22 +102,36 @@ ij_array_map = function(i, j, dims) { # DBI #### -## dbDisconnect #### -#' @title dbDisconnect -#' @rdname DBI -#' @export -setMethod('dbDisconnect', signature(x = 'dbMatrix'), - function(x, ...){ - con <- get_con(x) - DBI::dbDisconnect(conn = con, shutdown = TRUE) - }) - -## dbListTables #### -#' @title dbListTables -#' @rdname DBI -#' @export -setMethod('dbListTables', signature(x = 'dbMatrix'), - function(x, ...){ - con <- get_con(x) - DBI::dbListTables(conn = con) - }) +#' ## dbDisconnect #### +#' #' @title dbDisconnect +#' #' @rdname DBI +#' #' @export +#' setMethod('dbDisconnect', signature(x = 'dbMatrix'), +#' function(x, ...){ +#' con <- get_con(x) +#' DBI::dbDisconnect(conn = con, shutdown = TRUE) +#' }) +#' +#' ## dbListTables #### +#' #' @title dbListTables +#' #' @rdname DBI +#' #' @export +#' setMethod('dbListTables', signature(x = 'dbMatrix'), +#' function(x, ...){ +#' con <- get_con(x) +#' DBI::dbListTables(conn = con) +#' }) + +# dbplyr #### + +#' Generate table names +#' @details +#' based on dbplyr::unique_table_name +#' +#' @noRd +#' @keywords internal +unique_table_name <- function(prefix = ""){ + vals <- c(letters, LETTERS, 0:9) + name <- paste0(sample(vals, 10, replace = TRUE), collapse = "") + paste0(prefix, "_", name) +} From 388924a08f3e6d113b83b81daa9c62eea1c87701 Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:42:52 -0700 Subject: [PATCH 25/27] feat: add new tests -- TODO: ops for dbsm and dbdm --- tests/testthat/test-arith-dbdm-dbdm.R | 75 ++++++++++++++++++ tests/testthat/test-arith-dbdm-scalar.R | 71 +++++++++++++++++ tests/testthat/test-arith-dbdm-vector.R | 65 ++++++++++++++++ tests/testthat/test-arith-dbsm-dbsm.R | 76 +++++++++++++++++++ ...rith-scalar.R => test-arith-dbsm-scalar.R} | 0 tests/testthat/test-arith-dbsm-vector.R | 72 ++++++++++++++++++ 6 files changed, 359 insertions(+) create mode 100644 tests/testthat/test-arith-dbdm-dbdm.R create mode 100644 tests/testthat/test-arith-dbdm-scalar.R create mode 100644 tests/testthat/test-arith-dbdm-vector.R create mode 100644 tests/testthat/test-arith-dbsm-dbsm.R rename tests/testthat/{test-arith-scalar.R => test-arith-dbsm-scalar.R} (100%) create mode 100644 tests/testthat/test-arith-dbsm-vector.R diff --git a/tests/testthat/test-arith-dbdm-dbdm.R b/tests/testthat/test-arith-dbdm-dbdm.R new file mode 100644 index 0000000..151d9a9 --- /dev/null +++ b/tests/testthat/test-arith-dbdm-dbdm.R @@ -0,0 +1,75 @@ +# silence deprecated internal functions +rlang::local_options(lifecycle_verbosity = "quiet") + +# ---------------------------------------------------------------------------- # +# Load the RDS file in the 'data' folder +dgc = readRDS(system.file("data", "dgc.rds", package = "dbMatrix")) +mat = as.matrix(dgc + 1) + +con1 = DBI::dbConnect(duckdb::duckdb(), ":memory:") + +dbdm = dbMatrix::dbMatrix(value = mat, + con = con1, + name = 'mat', + class = "dbDenseMatrix", + overwrite = TRUE) + +# ---------------------------------------------------------------------------- # +# Test scalar arithmetic +res_mat = mat + mat +res_dbdm = dbdm + dbdm +res_dbdm = as_matrix(res_dbdm) +test_that("+ matrix equal", { + expect_equal(res_mat, res_dbdm) +}) + + +# FIXME: +# Interesting edge case where the result is a zero matrix +# How to handle this? +# res_mat = mat - mat +# res_dbdm = dbdm - dbdm +# res_dbdm = as_matrix(res_dbdm) +# test_that("- matrix equal", { +# expect_equal(res_mat, res_dbdm) +# }) + +res_mat = mat * mat +res_dbdm = dbdm * dbdm +res_dbdm = as_matrix(res_dbdm) +test_that("* matrix equal", { + expect_equal(res_mat, res_dbdm) +}) + +# FIXME: +# Support for division by 0 +res_mat = mat / mat +res_dbdm = dbdm / dbdm +res_dbdm = as_matrix(res_dbdm) +test_that("/ matrix equal", { + expect_equal(res_mat, res_dbdm) +}) + +# FIXME: NaN and 1 logic +# res_mat = mat ^ mat +# res_dbdm = dbdm ^ dbdm +# res_dbdm = as_matrix(res_dbdm) +# test_that("^ matrix equal", { +# expect_equal(res_mat, res_dbdm) +# }) + +# FIXME: division by 0, NaN and 1 logic +# res_mat = mat %% mat +# res_dbdm = dbdm %% dbdm +# res_dbdm = as_matrix(res_dbdm) +# test_that("%% matrix equal", { +# expect_equal(res_mat, res_dbdm) +# }) + +# FIXME: division by 0, Nan and 1 logic +# res_mat = mat %/% mat +# res_dbdm = dbdm %/% dbdm +# res_dbdm = as_matrix(res_dbdm) +# test_that("%/% matrix equal", { +# expect_equal(res_mat, res_dbdm) +# }) diff --git a/tests/testthat/test-arith-dbdm-scalar.R b/tests/testthat/test-arith-dbdm-scalar.R new file mode 100644 index 0000000..a5786c0 --- /dev/null +++ b/tests/testthat/test-arith-dbdm-scalar.R @@ -0,0 +1,71 @@ +# silence deprecated internal functions +rlang::local_options(lifecycle_verbosity = "quiet") + +# ---------------------------------------------------------------------------- # +# Load the RDS file in the 'data' folder +dgc = readRDS(system.file("data", "dgc.rds", package = "dbMatrix")) +mat = as.matrix(dgc + 1) + +con1 = DBI::dbConnect(duckdb::duckdb(), ":memory:") + +dbdm = dbMatrix::dbMatrix(value = mat, + con = con1, + name = 'mat', + class = "dbDenseMatrix", + overwrite = TRUE) + +# ---------------------------------------------------------------------------- # +# Test scalar arithmetic + +res_mat = mat + 1 +res_mat = res_mat |> as.matrix() #dgeMatrix casting +res_dbdm = dbdm + 1 +res_dbdm = as_matrix(res_dbdm) + +test_that("+ 1 equal", { + expect_equal(res_mat, res_dbdm) +}) + + +res_mat = mat - 1 +res_mat = res_mat |> as.matrix() #dgeMatrix casting +res_dbdm = dbdm - 1 +res_dbdm = as_matrix(res_dbdm) + +test_that("-1 equal", { + expect_equal(res_mat, res_dbdm) +}) + + +res_mat = mat * 10 +res_dbdm = dbdm * 10 +res_dbdm = as_matrix(res_dbdm) + +test_that("* 10 equal", { + expect_equal(res_mat, res_dbdm) +}) + + +res_mat = mat + 0 +res_dbdm = dbdm + 0 +res_dbdm = as_matrix(res_dbdm) + +test_that("+0 equal", { + expect_equal(res_mat, res_dbdm) +}) + +res_mat = mat / 10 +res_dbdm = dbdm / 10 +res_dbdm = as_matrix(res_dbdm) + +test_that("/10 equal", { + expect_equal(res_mat, res_dbdm) +}) + +# res_mat = mat / 0 +# res_dbdm = dbdm / 0 +# res_dbdm = as_matrix(res_dbdm) +# +# test_that("/ 0 equal", { +# expect_equal(res_mat, res_dbdm) +# }) diff --git a/tests/testthat/test-arith-dbdm-vector.R b/tests/testthat/test-arith-dbdm-vector.R new file mode 100644 index 0000000..62e4df8 --- /dev/null +++ b/tests/testthat/test-arith-dbdm-vector.R @@ -0,0 +1,65 @@ +# silence deprecated internal functions +rlang::local_options(lifecycle_verbosity = "quiet") + +# ---------------------------------------------------------------------------- # +# Load the RDS file in the 'data' folder +dgc = readRDS(system.file("data", "dgc.rds", package = "dbMatrix")) +mat = as.matrix(dgc + 1) + +con1 = DBI::dbConnect(duckdb::duckdb(), ":memory:") + +dbdm = dbMatrix::dbMatrix(value = mat, + con = con1, + name = 'mat', + class = "dbDenseMatrix", + overwrite = TRUE) + +# ---------------------------------------------------------------------------- # +# Test dbMatrix-vector arithmetic + +res_mat = mat + c(1,2,3) +res_dbdm = dbdm + c(1,2,3) +res_dbdm = as_matrix(res_dbdm) +test_that("+ 1 equal", { + expect_equal(res_mat, res_dbdm) +}) + + +res_mat = mat - c(1,2,3) +res_dbdm = dbdm - c(1,2,3) +res_dbdm = as_matrix(res_dbdm) +test_that("-1 equal", { + expect_equal(res_mat, res_dbdm) +}) + + +res_mat = mat * c(1,2,3) +res_dbdm = dbdm * c(1,2,3) +res_dbdm = as_matrix(res_dbdm) +test_that("* 10 equal", { + expect_equal(res_mat, res_dbdm) +}) + + +res_mat = mat + c(1,2,3) +res_dbdm = dbdm + c(1,2,3) +res_dbdm = as_matrix(res_dbdm) +test_that("+0 equal", { + expect_equal(res_mat, res_dbdm) +}) + +res_mat = mat / c(1,2,3) +res_dbdm = dbdm / c(1,2,3) +res_dbdm = as_matrix(res_dbdm) +test_that("/10 equal", { + expect_equal(res_mat, res_dbdm) +}) + +#FIXME: support for division by zero +# res_mat = mat / 0 +# res_dbdm = dbdm / 0 +# res_dbdm = as_matrix(res_dbdm) +# +# test_that("/ 0 equal", { +# expect_equal(res_mat, res_dbdm) +# }) diff --git a/tests/testthat/test-arith-dbsm-dbsm.R b/tests/testthat/test-arith-dbsm-dbsm.R new file mode 100644 index 0000000..e1afad4 --- /dev/null +++ b/tests/testthat/test-arith-dbsm-dbsm.R @@ -0,0 +1,76 @@ +# silence deprecated internal functions +rlang::local_options(lifecycle_verbosity = "quiet") + +# ---------------------------------------------------------------------------- # +# Load the RDS file in the 'data' folder +dgc = readRDS(system.file("data", "dgc.rds", package = "dbMatrix")) + +con1 = DBI::dbConnect(duckdb::duckdb(), ":memory:") + +dbsm = dbMatrix::dbMatrix(value = dgc, + con = con1, + name = 'dgc', + class = "dbSparseMatrix", + overwrite = TRUE) + +# ---------------------------------------------------------------------------- # +# Test scalar arithmetic + +res_dgc = dgc + dgc +res_dbsm = dbsm + dbsm +res_dbsm = as_matrix(res_dbsm) + +test_that("+ matrix equal", { + expect_equal(res_dgc, res_dbsm) +}) + + +# FIXME: +# Interesting edge case where the result is a zero matrix +# How to handle this? +# res_dgc = dgc - dgc +# res_dbsm = dbsm - dbsm +# res_dbsm = as_matrix(res_dbsm) +# test_that("- matrix equal", { +# expect_equal(res_dgc, res_dbsm) +# }) + +res_dgc = dgc * dgc +res_dbsm = dbsm * dbsm +res_dbsm = as_matrix(res_dbsm) +test_that("* matrix equal", { + expect_equal(res_dgc, res_dbsm) +}) + +# FIXME: +# Support for division by 0 +# res_dgc = dgc / dgc +# res_dbsm = dbsm / dbsm +# res_dbsm = as_matrix(res_dbsm) +# test_that("/ matrix equal", { +# expect_equal(res_dgc, res_dbsm) +# }) + +# FIXME: NaN and 1 logic +# res_dgc = dgc ^ dgc +# res_dbsm = dbsm ^ dbsm +# res_dbsm = as_matrix(res_dbsm) +# test_that("^ matrix equal", { +# expect_equal(res_dgc, res_dbsm) +# }) + +# FIXME: division by 0, NaN and 1 logic +# res_dgc = dgc %% dgc +# res_dbsm = dbsm %% dbsm +# res_dbsm = as_matrix(res_dbsm) +# test_that("%% matrix equal", { +# expect_equal(res_dgc, res_dbsm) +# }) + +# FIXME: division by 0, Nan and 1 logic +# res_dgc = dgc %/% dgc +# res_dbsm = dbsm %/% dbsm +# res_dbsm = as_matrix(res_dbsm) +# test_that("%/% matrix equal", { +# expect_equal(res_dgc, res_dbsm) +# }) diff --git a/tests/testthat/test-arith-scalar.R b/tests/testthat/test-arith-dbsm-scalar.R similarity index 100% rename from tests/testthat/test-arith-scalar.R rename to tests/testthat/test-arith-dbsm-scalar.R diff --git a/tests/testthat/test-arith-dbsm-vector.R b/tests/testthat/test-arith-dbsm-vector.R new file mode 100644 index 0000000..0725b14 --- /dev/null +++ b/tests/testthat/test-arith-dbsm-vector.R @@ -0,0 +1,72 @@ +# silence deprecated internal functions +rlang::local_options(lifecycle_verbosity = "quiet") + +# ---------------------------------------------------------------------------- # +# Load the RDS file in the 'data' folder +dgc = readRDS(system.file("data", "dgc.rds", package = "dbMatrix")) + +con1 = DBI::dbConnect(duckdb::duckdb(), ":memory:") + +dbsm = dbMatrix::dbMatrix(value = dgc, + con = con1, + name = 'dgc', + class = "dbSparseMatrix", + overwrite = TRUE) + +# ---------------------------------------------------------------------------- # +# Test scalar arithmetic + +res_dgc = dgc + c(1,2,3) +res_dgc = res_dgc |> as.matrix() #dgeMatrix casting +res_dbsm = dbsm + c(1,2,3) +res_dbsm = as_matrix(res_dbsm) + +test_that("+ 1 equal", { + expect_equal(res_dgc, res_dbsm) +}) + + +res_dgc = dgc - c(1,2,3) +res_dgc = as.matrix(res_dgc) #dgeMatrix casting +res_dbsm = dbsm - c(1,2,3) +res_dbsm = as_matrix(res_dbsm) + +test_that("-1 equal", { + expect_equal(res_dgc, res_dbsm) +}) + + +res_dgc = dgc * c(1,2,3) +res_dbsm = dbsm * c(1,2,3) +res_dbsm = as_matrix(res_dbsm) + +test_that("* 10 equal", { + expect_equal(res_dgc, res_dbsm) +}) + + +res_dgc = dgc + c(1,2,3) +res_dgc = as.matrix(res_dgc) #dgeMatrix casting +res_dbsm = dbsm + c(1,2,3) +res_dbsm = as_matrix(res_dbsm) + +test_that("+0 equal", { + expect_equal(res_dgc, res_dbsm) +}) + +res_dgc = dgc / c(1,2,3) +res_dbsm = dbsm / c(1,2,3) +res_dbsm = as_matrix(res_dbsm) + +test_that("/10 equal", { + expect_equal(res_dgc, res_dbsm) +}) + +#FIXME: support for division by zero +# res_dgc = dgc / 0 +# res_dbsm = dbsm / 0 +# res_dbsm = as_matrix(res_dbsm) +# +# test_that("/ 0 equal", { +# expect_equal(res_dgc, res_dbsm) +# }) From 0e2b2769302ce3cd17d7e3c01c00053daacd686b Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:43:04 -0700 Subject: [PATCH 26/27] docs: update --- vignettes/arithmetic.Rmd | 5 ----- vignettes/operations.Rmd | 5 ----- 2 files changed, 10 deletions(-) diff --git a/vignettes/arithmetic.Rmd b/vignettes/arithmetic.Rmd index dcb0f15..af80b5a 100644 --- a/vignettes/arithmetic.Rmd +++ b/vignettes/arithmetic.Rmd @@ -75,11 +75,6 @@ dbsm * dbsm #### Matrix product TODO -### Shutdown -```{r} -dbMatrix::dbDisconnect(dbsm, shutdown = TRUE) -``` - ### Session Info ```{r} sessionInfo() diff --git a/vignettes/operations.Rmd b/vignettes/operations.Rmd index fde3103..3f6bbfa 100644 --- a/vignettes/operations.Rmd +++ b/vignettes/operations.Rmd @@ -169,11 +169,6 @@ dbMatrix::mean(dense) dim(dense) ``` -## Shutdown -```{r} -dbMatrix::dbDisconnect(dense, shutdown = TRUE) -``` - ## Session Info ```{r eval=TRUE, message=FALSE} sessionInfo() From 979945d9c1239d30c97d28574cb928254173441a Mon Sep 17 00:00:00 2001 From: Eddie Ruiz <32622519+Ed2uiz@users.noreply.github.com> Date: Wed, 14 Aug 2024 08:14:07 -0700 Subject: [PATCH 27/27] fix: remove tests from gha workflow --- .github/workflows/tests.yaml | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 .github/workflows/tests.yaml diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml deleted file mode 100644 index bb664b1..0000000 --- a/.github/workflows/tests.yaml +++ /dev/null @@ -1,26 +0,0 @@ -on: - push: - branches: [main, master, dev] - pull_request: - branches: [main, master, dev] - -name: Run tests - -jobs: - test: - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - steps: - - uses: actions/checkout@v3 - - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::testthat - needs: tests - - name: Run tests - run: | - testthat::test_dir("tests/testthat") - shell: Rscript {0}