保存dplyr函数中的已删除数据:如何防止冲突/覆盖

时间:2015-02-22 20:04:49

标签: r dplyr

编辑:根据要求提供更完整的解释: 如果您不了解我首先要做的事情,这将很难解释。当您进行链接时,很多时候您只想将操作集中在部分数据(特定行,汇总表,列名等)上。完成后,您经常希望将这些更改合并到原始数据中。但是,如果不破坏链条的流动,这是不可能的。这些功能可以让您这样做。但是,原始数据的副本存储在称为银行的位置。此外,为了正确地重新组合行,保存行和加载行使用一个名为index的新变量。银行和指数都可能在链条中意外修改导致问题。另外,如果存在称为bank和index的变量,该怎么办?有没有避免这些问题?

许多dplyr操作会导致数据丢失。这是一个系统,可以保存和恢复这些数据。 Save将是一个字符串,用于标识一大块已删除的数据。该数据将以列表形式存储在银行中。请注意,这是由skranz的mutate_if启发的。

我的问题是:如何更好地处理银行和指数,以免这些变量被覆盖或干扰现有变量

library(dplyr)
library(magrittr)

bank = NULL

save_rows = function(FUN)
  function(data, save, ...) {
    data$index = 1:nrow(data)
    results = data %>% FUN(...)
    bank[[save]] <<-
      data %>%
      anti_join(results %>% select(index))
    results}

load_rows = function(data, save)
  data %>% 
  bind_rows(bank[[save]]) %>%
  arrange(index) %>%
  select(-index)

save_all = function(FUN)
  function(data, save, ...) {
    bank[[save]] <<- data
    data %>% FUN(...)
  }

load_all = function(data, save)
  bank[[save]] %>% full_join(data)

change_names = function(data, save) {
  bank[[save]] <<- data
  data %>% colnames
}

attach_names_to = function(colnames, save) {
  bank[[save]] %>% 
    set_colnames(colnames)
}

#example
library(stringr)

a = c(1, 1, 2, 3)
b = c("my_momma", "my_momma", "takes_care", "of_me")
c = c("you", "you", "and", "me")

data_frame(a, b, c) %>%
  save_rows(filter)("filter", a == 1) %>%
  mutate(c = paste("baby", c, sep = "_")) %>%
  load_rows("filter") %>%
  save_rows(slice)("slice", 1) %>%
  mutate(b = "my_papa") %>%
  load_rows("slice") %>%
  group_by(a) %>%
  save_all(summarize)("summarize", n = n()) %>%
  load_all("summarize") %>%
  save_all(distinct)("distinct", a) %>%
  mutate(B = b %>% str_replace_all("_", " ")) %>%
  select(a, B) %>%
  load_all("distinct") %>%
  change_names("data") %>%
  paste("2", sep = "") %>%
  attach_names_to("data")

5 个答案:

答案 0 :(得分:2)

我没有看到你的方法优于dplyrExtras采用的方法。您的大多数代码都可以在dplyrdplyrExtras中重写。请参阅下文,了解如何在没有额外功能的情况下实现示例。

此外,使用<<-通常是个坏主意。如果您真的希望进一步发展您的方法,也许您可​​以使用与options()类似的方法。例如,请参阅?igraph:::igraph.options以获取此示例。

最后,您的重命名方式不会保留组。在您的示例中,最终分组在a之后,但数据中没有a

最后,这是您的示例的替代方案:

# to get dplyrExtras
library(devtools)
install_github(repo="skranz/dplyrExtras")
require(dplyrExtras)
# the code that does the same as your example (except the final grouping)
data_frame(a, b, c) %>%
  mutate_if(a == 1,  c = paste("baby", c, sep = "_"))  %>%
  mutate_if(1, b = "my_papa") %>%
  group_by(a) %>%
  mutate(n = n()) %>% 
  mutate(B = b %>% str_replace_all("_", " ")) %>%
  ungroup() %>%
  do(set_colnames(., paste0(colnames(.), "2"))) %>%
  group_by(a2)

答案 1 :(得分:0)

您的策略不必要地复杂化,在所有情况下都存在更简单的方法。而不是

data_frame(a, b, c) %>%
  save_rows(filter)("filter", a == 1) %>%
  mutate(c = paste("baby", c, sep = "_")) %>%
  load_rows("filter")

使用类似的东西:

d <- data_frame(a, b, c)
d %>%
  mutate(c = ifelse(a == 1, paste("baby", c, sep = "_"), c))

而不是

d %>%
  save_rows(slice)("slice", 1) %>%
  mutate(b = "my_papa") %>%
  load_rows("slice")

使用类似

的内容
d %>%
    mutate(b = ifelse(1:n() == 1, "my_papa", b))

而不是

d %>%
  group_by(a) %>%
  save_all(summarize)("summarize", n = n()) %>%
  load_all("summarize")

使用

d %>%
group_by(a) %>%
mutate(n = n())

答案 2 :(得分:0)

我仍然认为mutate_if采取的方法是可行的。但似乎你说对于某些例子来说,总结和重新合并可能会更快。我倾向于使用类似mutate_if的函数来处理这种情况。下面我展示了mutate_group函数的方法以及一些基准测试。

require(dplyr)
# mutate_group function
mutate_group <- function(df, ...){
  .dots = lazyeval::lazy_dots(...)
  groups <- groups(df)
  # grouping variables defined
  mdf <- df %>% 
    mutate_(.dots = setNames(names(.dots), paste0(".group.", names(.dots))))
  mdf %>% 
    group_by_(.dots = paste0(".group.", names(.dots))) %>% 
    summarize_() %>% 
    ungroup %>%
    mutate_(.dots=.dots) %>% 
    right_join(mdf, by=paste0(".group.", names(.dots))) %>% 
    select_(.dots = c(paste0("-.group.", names(.dots)), paste0("-", names(.dots), ".y"))) %>%
    rename_(.dots = setNames(paste0(names(.dots), ".x"), names(.dots)))   %>%
    group_by_(.dots=groups)
}

现在进行基准测试:

require(microbenchmark)
# testing 1
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE), 
                 y = runif(1e6))
#  
microbenchmark(
  r1 <- df %>% mutate(x = gsub('a', 't', x))
  ,
  r2 <- df %>% mutate_group(x = gsub('a', 't', .group.x))
)
#
## Unit: milliseconds
##                                                             expr      min       lq     mean   median       uq      max neval cld
## r1 <- df %>% mutate(x = gsub("a", "t", x))              324.9036 328.7171 337.6389 330.2874 345.2245 415.6200   100   b
## r2 <- df %>% mutate_group(x = gsub("a", "t", .group.x)) 117.0220 120.1766 128.9403 121.8053 135.4410 208.5801   100  a 
#
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
#
# testing 2
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE), 
                 y = sample(letters[1:4], 1e6, replace=TRUE), 
                 z = runif(1e6))
microbenchmark(
  r1 <- df %>% mutate(x = gsub('a', 't', x), 
                      y = gsub('b', 's', y))
  ,
  r2 <- df %>% mutate_group(x = gsub('a', 't', .group.x), 
                            y = gsub('b', 's', .group.y))
)
## Unit: milliseconds
##                                                                                       expr      min       lq     mean   median       uq      max neval cld
## r1 <- df %>% mutate(x = gsub("a", "t", x), y = gsub("b", "s",      y))                     665.9306 674.2292 691.7966 682.0874 695.8887 776.9765   100   b
## r2 <- df %>% mutate_group(x = gsub("a", "t", .group.x), y = gsub("b",      "s", .group.y)) 150.9971 156.5910 177.6797 171.7907 177.9938 279.1329   100  a 
#
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
#
# testing 3
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE), 
                 y = sample(letters[1:4], 1e6, replace=TRUE), 
                 z = runif(1e6))
#  
microbenchmark(
  r1 <- df %>% group_by(y) %>% mutate(x = gsub('a', 't', x))
  ,
  r2 <- df %>% group_by(y) %>% mutate_group(x = gsub('a', 't', .group.x))
)
#
## Unit: milliseconds
##                                                                         expr      min       lq     mean   median       uq      max neval cld
## r1 <- df %>% group_by(y) %>% mutate(x = gsub("a", "t", x))                   399.8765 405.0650 415.0338 408.3451 423.2523 494.0247   100   b
## r2 <- df %>% group_by(y) %>% mutate_group(x = gsub("a", "t",      .group.x)) 224.0281 231.9769 247.7521 244.8441 248.5926 319.9048   100  a 
#
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE

答案 3 :(得分:0)

好的,这是一个新版本。语法简化了。我仍然没有通过意外干扰解决这个问题。

library(stringr)
library(stringi)
library(dplyr)
library(dplyrExtras)
library(pipeR)

bank = list("dummy" = NULL)

oldBank = function() length(bank)
newBank = function() length(bank) + 1

oldIndex = function() paste("index", oldBank(), sep = "")
newIndex = function() paste("index", newBank(), sep = "")

selectOldIndex = function(data) data %>>%
  select_(oldIndex())
removeOldIndex = function(data) data %>>%
  select_(paste("-", oldIndex()))

focus = function(data) {
  data[[newIndex()]] = 1:nrow(data)
  bank[[newBank()]] <<- data
  data
}

restore = function(zoom) {
  restoreData = 
    bank[[oldBank()]] %>>%
    removeOldIndex %>>%
    left_join(zoom)
  bank[[oldBank()]] <<-NULL
  restoreData
}

restore_rows = function(zoom) {
  restoreData = 
    bank[[oldBank()]] %>>%
    anti_join(zoom %>>% selectOldIndex()) %>>%
    bind_rows(zoom) %>>%
    removeOldIndex()
  bank[[oldBank()]] <<-NULL
  restoreData
}

#example

options(stringsAsFactors = FALSE)

characters = c("1st", "2nd", "3rd", "other_value") %>>%
  rep(10) %>>%
  (data.frame(value = ., type = "character"))
numerics = c("1", "2", "3", ".") %>>%
  rep(10) %>>%
  (data.frame(value = ., type = "numeric"))
data = bind_rows(characters, numerics)

abbrevs = data_frame(
  value = c("1st", "2nd", "3rd"),
  full = c("first_value", "second_value", "third_value"))

results =
  data %>>%
  focus %>>%
    group_by(value) %>>%
    summarize(n = nrow(.), type = first(type)) %>>%
    focus %>>%
      filter(type == "character") %>>%
      left_join(abbrevs) %>>%
      mutate_if(is.na(full), full = value) %>>%
      mutate(full = full %>>%
               str_replace_all("_", " ") %>>%
               stri_trans_totitle()) %>>%
    restore_rows %>>%
  restore %>>%
  mutate_if(!is.na(full), value = full) %>>%
  select(-full)

答案 4 :(得分:0)

好的,这是第3版。我通过允许用户为银行和索引指定自己的名称来解决干扰问题。

library(stringr)
library(stringi)
library(dplyr)
library(dplyrExtras)
library(pipeR)
library(lazyeval)
library(gtools)

construct_bank_index = defmacro(bank, index, expr = {

  bank = list("NULL" = NULL)

  old_bank = function() length(bank)
  new_bank = function() length(bank) + 1

  old_index = function() paste(index, old_bank(), sep = "")
  newIndex = function() paste(index, new_bank(), sep = "")

  select_old_index = function(data) data %>>%
    select_(old_index())
  remove_old_index = function(data) data %>>%
    select_(paste("-", old_index()))

  focus = function(data) {
    data[[newIndex()]] = 1:nrow(data)
    bank[[new_bank()]] <<- data
    data
  }

  restore = function(zoom) {
    restoreData = 
      bank[[old_bank()]] %>>%
      remove_old_index %>>%
      left_join(zoom)
    bank[[old_bank()]] <<-NULL
    restoreData
  }

  restore_rows = function(zoom) {
    restoreData = 
      bank[[old_bank()]] %>>%
      anti_join(zoom %>>% select_old_index()) %>>%
      bind_rows(zoom) %>>%
      remove_old_index()
    bank[[old_bank()]] <<-NULL
    restoreData
  }
})

#example

construct_bank_index(bank, "index")

options(stringsAsFactors = FALSE)

characters = c("1st", "2nd", "3rd", "other_value") %>>%
  rep(10) %>>%
  (data.frame(value = ., type = "character"))
numerics = c("1", "2", "3", ".") %>>%
  rep(10) %>>%
  (data.frame(value = ., type = "numeric"))
data = bind_rows(characters, numerics)

abbrevs = data_frame(
  value = c("1st", "2nd", "3rd"),
  full = c("first_value", "second_value", "third_value"))

results =
  data %>>%
  focus %>>%
    group_by(value) %>>%
    summarize(n = n(), type = first(type)) %>>%
    focus %>>%
      filter(type == "character") %>>%
      left_join(abbrevs) %>>%
      mutate_if(is.na(full), full = value) %>>%
      mutate(full = full %>>%
               str_replace_all("_", " ") %>>%
               stri_trans_totitle()) %>>%
    restore_rows %>>%
  restore %>>%
  mutate_if(!is.na(full), value = full) %>>%
  select(-full)