to v0.1.0

talegari · Jun 1, 2023 · f7e57bf · f7e57bf
1 parent b821ee2
commit f7e57bf
Show file tree

Hide file tree

Showing 21 changed files with 582 additions and 299 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tidier
 Title: Enhanced 'mutate'
-Version: 0.0.1
+Version: 0.1.0
 Authors@R: 
     person("Srikanth",
            "Komala Sheshachala",,

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# tidier 0.1.0 (on github: 2023-06-01)
+
+* Exposed slider's `.complete` argument in `tidier::mutate`
+* bugfix: `mutate` can now modify a column (same name) in sliding operation. 
+
 # tidier 0.0.1
 
 * Added a `NEWS.md` file to track changes to the package.
diff --git a/R/mutate.R b/R/mutate.R
@@ -47,6 +47,10 @@
 #' @param .index (string, optional: Yes) name of index column
 #' @param .desc (logical_vector, default: FALSE) bool or logical vector of same
 #'   length as `.order_by`.
+#' @param .complete (flag, default: FALSE) This will be passed to
+#'   `slider::slide` / `slider::slide_vec`. Should the function be evaluated on
+#'   complete windows only? If FALSE, the default, then partial computations
+#'   will be allowed.
 #' @return data.frame
 #' @importFrom magrittr %>%
 #' @importFrom utils tail
@@ -72,37 +76,27 @@
 #' # Using a sample airquality dataset,
 #' # compute mean temp over last seven days in the same month for every row
 #'
+#' set.seed(101)
 #' airquality %>%
 #'   # create date column
-#'   dplyr::mutate(date_col = as.Date(paste("1973",
-#'                                          stringr::str_pad(Month,
-#'                                                           width = 2,
-#'                                                           side = "left",
-#'                                                           pad = "0"
-#'                                                           ),
-#'                                          stringr::str_pad(Day,
-#'                                                           width = 2,
-#'                                                           side = "left",
-#'                                                           pad = "0"
-#'                                                           ),
-#'                                          sep = "-"
-#'                                          )
-#'                                   )
-#'                 ) %>%
+#'   dplyr::mutate(date_col = lubridate::make_date(1973, Month, Day)) %>%
 #'   # create gaps by removing some days
 #'   dplyr::slice_sample(prop = 0.8) %>%
+#'   dplyr::arrange(date_col) %>%
 #'   # compute mean temperature over last seven days in the same month
-#'   mutate_(avg_temp_over_last_week = mean(Temp, na.rm = TRUE),
-#'           .order_by = "Day",
-#'           .by = "Month",
-#'           .frame = c(lubridate::days(7), # 7 days before current row
-#'                      lubridate::days(-1) # do not include current row
-#'                      ),
-#'           .index = "date_col"
-#'           )
+#'   tidier::mutate(avg_temp_over_last_week = mean(Temp, na.rm = TRUE),
+#'                  .order_by = "Day",
+#'                  .by = "Month",
+#'                  .frame = c(lubridate::days(7), # 7 days before current row
+#'                             lubridate::days(-1) # do not include current row
+#'                             ),
+#'                  .index = "date_col"
+#'                  )
 #' @export
 
-mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
+mutate_ = function(x, ..., .by, .order_by, .frame, .index,
+                   .desc = FALSE, .complete = FALSE
+                   ){
 
   # capture expressions --------------------------------------------------------
   ddd = rlang::enquos(...)
@@ -182,9 +176,11 @@ mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
                             x_copy,
                             .f = ~ as.list(dplyr::summarise(.x, !!!ddd)),
                             .before = .frame[1],
-                            .after  = .frame[2]
+                            .after  = .frame[2],
+                            .complete = .complete
                             )
                         ) %>%
+          remove_common_nested_columns(slide_output__) %>%
           tidyr::unnest_wider(slide_output__)
       } else {
         x_copy = x_copy %>%
@@ -194,9 +190,11 @@ mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
                             .f = ~ as.list(dplyr::summarise(.x, !!!ddd)),
                             .i = x_copy[[.index]],
                             .before = .frame[1],
-                            .after  = .frame[2]
+                            .after  = .frame[2],
+                            .complete = .complete
                             )
                         ) %>%
+          remove_common_nested_columns(slide_output__) %>%
           tidyr::unnest_wider(slide_output__)
       }
     }
@@ -221,7 +219,8 @@ mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
                                 chunk,
                                 .f = ~ as.list(dplyr::summarise(.x, !!!ddd)),
                                 .before = .frame[1],
-                                .after  = .frame[2]
+                                .after  = .frame[2],
+                                .complete = .complete
                                 )
                           )
         } else {
@@ -232,7 +231,8 @@ mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
                                 .f = ~ as.list(dplyr::summarise(.x, !!!ddd)),
                                 .i  = chunk[[.index]],
                                 .before = .frame[1],
-                                .after  = .frame[2]
+                                .after  = .frame[2],
+                                .complete = .complete
                                 )
                           )
         }
@@ -252,6 +252,7 @@ mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
         dplyr::ungroup() %>%
         dplyr::mutate(data__ = furrr::future_map(data__, fun_per_chunk)) %>%
         tidyr::unnest(data__) %>%
+        remove_common_nested_columns(slide_output__) %>%
         tidyr::unnest_wider(slide_output__)
 
     }
@@ -285,6 +286,7 @@ mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
 #'   API](https://www.databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html).
 #'
 #'
+#'
 #'   Implementation Details:
 #'
 #'   - Iteration per row over the window is implemented using the versatile
@@ -309,6 +311,10 @@ mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
 #'   [interval](https://lubridate.tidyverse.org/reference/interval.html)
 #'   objects. See examples.
 #' @param .index (expression, optional: Yes) index column
+#' @param .complete (flag, default: FALSE) This will be passed to
+#'   `slider::slide` / `slider::slide_vec`. Should the function be evaluated on
+#'   complete windows only? If FALSE, the default, then partial computations
+#'   will be allowed.
 #' @return data.frame
 #' @importFrom magrittr %>%
 #' @importFrom utils tail
@@ -335,36 +341,24 @@ mutate_ = function(x, ..., .by, .order_by, .frame, .index, .desc = FALSE){
 #' # Using a sample airquality dataset,
 #' # compute mean temp over last seven days in the same month for every row
 #'
+#' set.seed(101)
 #' airquality %>%
 #'   # create date column
-#'   dplyr::mutate(date_col = as.Date(paste("1973",
-#'                                          stringr::str_pad(Month,
-#'                                                           width = 2,
-#'                                                           side = "left",
-#'                                                           pad = "0"
-#'                                                           ),
-#'                                          stringr::str_pad(Day,
-#'                                                           width = 2,
-#'                                                           side = "left",
-#'                                                           pad = "0"
-#'                                                           ),
-#'                                          sep = "-"
-#'                                          )
-#'                                   )
-#'                 ) %>%
+#'   dplyr::mutate(date_col = lubridate::make_date(1973, Month, Day)) %>%
 #'   # create gaps by removing some days
 #'   dplyr::slice_sample(prop = 0.8) %>%
+#'   dplyr::arrange(date_col) %>%
 #'   # compute mean temperature over last seven days in the same month
-#'   mutate(avg_temp_over_last_week = mean(Temp, na.rm = TRUE),
-#'          .order_by = Day,
-#'          .by = Month,
-#'          .frame = c(lubridate::days(7), # 7 days before current row
-#'                     lubridate::days(-1) # do not include current row
-#'                     ),
-#'          .index = date_col
-#'          )
+#'   tidier::mutate(avg_temp_over_last_week = mean(Temp, na.rm = TRUE),
+#'                  .order_by = Day,
+#'                  .by = Month,
+#'                  .frame = c(lubridate::days(7), # 7 days before current row
+#'                             lubridate::days(-1) # do not include current row
+#'                             ),
+#'                  .index = date_col
+#'                  )
 #' @export
-mutate = function(x, ..., .by, .order_by, .frame, .index){
+mutate = function(x, ..., .by, .order_by, .frame, .index, .complete = FALSE){
 
   # capture expressions --------------------------------------------------------
   ddd = rlang::enquos(...)
@@ -462,9 +456,11 @@ mutate = function(x, ..., .by, .order_by, .frame, .index){
                             x_copy,
                             .f = ~ as.list(dplyr::summarise(.x, !!!ddd)),
                             .before = .frame[1],
-                            .after  = .frame[2]
+                            .after  = .frame[2],
+                            .complete = .complete
                             )
                         ) %>%
+          remove_common_nested_columns(slide_output__) %>%
           tidyr::unnest_wider(slide_output__)
       } else {
         x_copy = x_copy %>%
@@ -474,9 +470,11 @@ mutate = function(x, ..., .by, .order_by, .frame, .index){
                             .f = ~ as.list(dplyr::summarise(.x, !!!ddd)),
                             .i = x_copy[[.index]],
                             .before = .frame[1],
-                            .after  = .frame[2]
+                            .after  = .frame[2],
+                            .complete = .complete
                             )
                         ) %>%
+          remove_common_nested_columns(slide_output__) %>%
           tidyr::unnest_wider(slide_output__)
       }
     }
@@ -499,7 +497,8 @@ mutate = function(x, ..., .by, .order_by, .frame, .index){
                                 chunk,
                                 .f = ~ as.list(dplyr::summarise(.x, !!!ddd)),
                                 .before = .frame[1],
-                                .after  = .frame[2]
+                                .after  = .frame[2],
+                                .complete = .complete
                                 )
                           )
         } else {
@@ -510,7 +509,8 @@ mutate = function(x, ..., .by, .order_by, .frame, .index){
                                 .f = ~ as.list(dplyr::summarise(.x, !!!ddd)),
                                 .i  = chunk[[.index]],
                                 .before = .frame[1],
-                                .after  = .frame[2]
+                                .after  = .frame[2],
+                                .complete = .complete
                                 )
                           )
         }
@@ -529,6 +529,7 @@ mutate = function(x, ..., .by, .order_by, .frame, .index){
         dplyr::ungroup() %>%
         dplyr::mutate(data__ = furrr::future_map(data__, fun_per_chunk)) %>%
         tidyr::unnest(data__) %>%
+        remove_common_nested_columns(slide_output__) %>%
         tidyr::unnest_wider(slide_output__)
 
     }
@@ -542,3 +543,29 @@ mutate = function(x, ..., .by, .order_by, .frame, .index){
   return(x_copy)
 }
 
+# remove_common_nested_columns ----
+#' @name remove_common_nested_columns
+#' @title Remove non-list columns when same are present in a list column
+#' @description Remove non-list columns when same are present in a list column
+#' @param df input dataframe
+#' @param list_column Name or expr of the column which is a list of named lists
+#' @return dataframe
+remove_common_nested_columns = function(df, list_column){
+
+  # we assume that all dfs in list_column have identical column names
+  new_names = df %>%
+    dplyr::slice(1) %>%
+    dplyr::pull({{ list_column }}) %>%
+    `[[`(1) %>%
+    names()
+
+  common_names = intersect(new_names, colnames(df))
+
+  if (length(common_names) >= 1){
+    for (a_common_name in common_names){
+      df[[a_common_name]] = NULL
+    }
+  }
+
+  return(df)
+}
diff --git a/README.Rmd b/README.Rmd
@@ -19,6 +19,8 @@ devtools::load_all()
 # tidier
 
 <!-- badges: start -->
+
+[![CRAN status](https://www.r-pkg.org/badges/version/tidier)](https://CRAN.R-project.org/package=tidier)
 <!-- badges: end -->
 
 `tidier` package provides '[Apache Spark](https://spark.apache.org/)' style window aggregation for R dataframes via '[mutate](https://dplyr.tidyverse.org/reference/mutate.html)' in '[dplyr](https://dplyr.tidyverse.org/index.html)' flavour.
@@ -37,42 +39,57 @@ airquality |>
   # compute mean temperature over last seven days in the same month
   tidier::mutate(avg_temp_over_last_week = mean(Temp, na.rm = TRUE),
                  .order_by = Day,
-                 .by = Month,
-                 .frame = c(lubridate::days(7), # 7 days before current row
-                            lubridate::days(-1) # do not include current row
-                            ),
-                 .index = date_col
+                 .by       = Month,
+                 .frame    = c(lubridate::days(7), # 7 days before current row
+                               lubridate::days(-1) # do not include current row
+                               ),
+                 .index    = date_col
                  )
 ```
 
+## Features
+
+- `mutate` supports
+  - `.by` (group by),
+  - `.order_by` (order by),
+  - `.frame` (endpoints of window frame),
+  - `.index` (identify index column like date column),
+  - `.complete` (whether to compute over incomplete window).
+
+- `mutate` automatically uses a future backend (via [`furrr`](https://furrr.futureverse.org/)).
+
 ## Motivation
 
 This implementation is inspired by Apache Spark's [`windowSpec`](https://spark.apache.org/docs/3.2.1/api/python/reference/api/pyspark.sql.Column.over.html?highlight=windowspec) class with [`rangeBetween`](https://spark.apache.org/docs/3.2.1/api/python/reference/api/pyspark.sql.WindowSpec.rangeBetween.html) and [`rowsBetween`](https://spark.apache.org/docs/3.2.1/api/python/reference/api/pyspark.sql.WindowSpec.rowsBetween.html).
 
 ## Ecosystem
 
-1. [`dbplyr`](https://dbplyr.tidyverse.org/) implements this via [`dbplyr::win_over`](https://dbplyr.tidyverse.org/reference/win_over.html?q=win_over#null) enabling [`sparklyr`](https://spark.rstudio.com/) users to write window computations. Also see, [`dbplyr::window_order`/`dbplyr::window_frame`](https://dbplyr.tidyverse.org/reference/window_order.html?q=window_fr#ref-usage).
+1.  [`dbplyr`](https://dbplyr.tidyverse.org/) implements this via [`dbplyr::win_over`](https://dbplyr.tidyverse.org/reference/win_over.html?q=win_over#null) enabling [`sparklyr`](https://spark.rstudio.com/) users to write window computations. Also see, [`dbplyr::window_order`/`dbplyr::window_frame`](https://dbplyr.tidyverse.org/reference/window_order.html?q=window_fr#ref-usage).
 
-2. [`tidypyspark`](https://talegari.github.io/tidypyspark/_build/html/index.html) python package implements `mutate` style window computation API for pyspark.
+2.  [`tidypyspark`](https://talegari.github.io/tidypyspark/_build/html/index.html) python package implements `mutate` style window computation API for pyspark.
 
 ## Installation
 
-- dev: `remotes::install_github("talegari/tidier")`
-- cran: `install.packages("tidier")`
+-   dev: `remotes::install_github("talegari/tidier")`
+-   cran: `install.packages("tidier")`
 
 ## Acknowledgements
 
 `tidier` package is deeply indebted to two amazing packages and people behind it.
 
-1. [`dplyr`](https://cran.r-project.org/package=dplyr):
-```
-Wickham H, François R, Henry L, Müller K, Vaughan D (2023). _dplyr: A
-Grammar of Data Manipulation_. R package version 1.1.0,
-<https://CRAN.R-project.org/package=dplyr>.
-```
+1.  [`dplyr`](https://cran.r-project.org/package=dplyr):
 
-2. [`slider`](https://cran.r-project.org/package=slider):
+```{=html}
+<!-- -->
 ```
-Vaughan D (2021). _slider: Sliding Window Functions_. R package
-version 0.2.2, <https://CRAN.R-project.org/package=slider>.
+    Wickham H, François R, Henry L, Müller K, Vaughan D (2023). _dplyr: A
+    Grammar of Data Manipulation_. R package version 1.1.0,
+    <https://CRAN.R-project.org/package=dplyr>.
+
+2.  [`slider`](https://cran.r-project.org/package=slider):
+
+```{=html}
+<!-- -->
 ```
+    Vaughan D (2021). _slider: Sliding Window Functions_. R package
+    version 0.2.2, <https://CRAN.R-project.org/package=slider>.