编辑:根据要求提供更完整的解释: 如果您不了解我首先要做的事情,这将很难解释。当您进行链接时,很多时候您只想将操作集中在部分数据(特定行,汇总表,列名等)上。完成后,您经常希望将这些更改合并到原始数据中。但是,如果不破坏链条的流动,这是不可能的。这些功能可以让您这样做。但是,原始数据的副本存储在称为银行的位置。此外,为了正确地重新组合行,保存行和加载行使用一个名为index的新变量。银行和指数都可能在链条中意外修改导致问题。另外,如果存在称为bank和index的变量,该怎么办?有没有避免这些问题?
许多dplyr操作会导致数据丢失。这是一个系统,可以保存和恢复这些数据。 Save将是一个字符串,用于标识一大块已删除的数据。该数据将以列表形式存储在银行中。请注意,这是由skranz的mutate_if启发的。
我的问题是:如何更好地处理银行和指数,以免这些变量被覆盖或干扰现有变量
library(dplyr)
library(magrittr)
bank = NULL
save_rows = function(FUN)
function(data, save, ...) {
data$index = 1:nrow(data)
results = data %>% FUN(...)
bank[[save]] <<-
data %>%
anti_join(results %>% select(index))
results}
load_rows = function(data, save)
data %>%
bind_rows(bank[[save]]) %>%
arrange(index) %>%
select(-index)
save_all = function(FUN)
function(data, save, ...) {
bank[[save]] <<- data
data %>% FUN(...)
}
load_all = function(data, save)
bank[[save]] %>% full_join(data)
change_names = function(data, save) {
bank[[save]] <<- data
data %>% colnames
}
attach_names_to = function(colnames, save) {
bank[[save]] %>%
set_colnames(colnames)
}
#example
library(stringr)
a = c(1, 1, 2, 3)
b = c("my_momma", "my_momma", "takes_care", "of_me")
c = c("you", "you", "and", "me")
data_frame(a, b, c) %>%
save_rows(filter)("filter", a == 1) %>%
mutate(c = paste("baby", c, sep = "_")) %>%
load_rows("filter") %>%
save_rows(slice)("slice", 1) %>%
mutate(b = "my_papa") %>%
load_rows("slice") %>%
group_by(a) %>%
save_all(summarize)("summarize", n = n()) %>%
load_all("summarize") %>%
save_all(distinct)("distinct", a) %>%
mutate(B = b %>% str_replace_all("_", " ")) %>%
select(a, B) %>%
load_all("distinct") %>%
change_names("data") %>%
paste("2", sep = "") %>%
attach_names_to("data")
答案 0 :(得分:2)
我没有看到你的方法优于dplyrExtras
采用的方法。您的大多数代码都可以在dplyr
或dplyrExtras
中重写。请参阅下文,了解如何在没有额外功能的情况下实现示例。
此外,使用<<-
通常是个坏主意。如果您真的希望进一步发展您的方法,也许您可以使用与options()
类似的方法。例如,请参阅?igraph:::igraph.options
以获取此示例。
最后,您的重命名方式不会保留组。在您的示例中,最终分组在a
之后,但数据中没有a
。
最后,这是您的示例的替代方案:
# to get dplyrExtras
library(devtools)
install_github(repo="skranz/dplyrExtras")
require(dplyrExtras)
# the code that does the same as your example (except the final grouping)
data_frame(a, b, c) %>%
mutate_if(a == 1, c = paste("baby", c, sep = "_")) %>%
mutate_if(1, b = "my_papa") %>%
group_by(a) %>%
mutate(n = n()) %>%
mutate(B = b %>% str_replace_all("_", " ")) %>%
ungroup() %>%
do(set_colnames(., paste0(colnames(.), "2"))) %>%
group_by(a2)
答案 1 :(得分:0)
您的策略不必要地复杂化,在所有情况下都存在更简单的方法。而不是
data_frame(a, b, c) %>%
save_rows(filter)("filter", a == 1) %>%
mutate(c = paste("baby", c, sep = "_")) %>%
load_rows("filter")
使用类似的东西:
d <- data_frame(a, b, c)
d %>%
mutate(c = ifelse(a == 1, paste("baby", c, sep = "_"), c))
而不是
d %>%
save_rows(slice)("slice", 1) %>%
mutate(b = "my_papa") %>%
load_rows("slice")
使用类似
的内容d %>%
mutate(b = ifelse(1:n() == 1, "my_papa", b))
而不是
d %>%
group_by(a) %>%
save_all(summarize)("summarize", n = n()) %>%
load_all("summarize")
使用
d %>%
group_by(a) %>%
mutate(n = n())
答案 2 :(得分:0)
我仍然认为mutate_if
采取的方法是可行的。但似乎你说对于某些例子来说,总结和重新合并可能会更快。我倾向于使用类似mutate_if
的函数来处理这种情况。下面我展示了mutate_group
函数的方法以及一些基准测试。
require(dplyr)
# mutate_group function
mutate_group <- function(df, ...){
.dots = lazyeval::lazy_dots(...)
groups <- groups(df)
# grouping variables defined
mdf <- df %>%
mutate_(.dots = setNames(names(.dots), paste0(".group.", names(.dots))))
mdf %>%
group_by_(.dots = paste0(".group.", names(.dots))) %>%
summarize_() %>%
ungroup %>%
mutate_(.dots=.dots) %>%
right_join(mdf, by=paste0(".group.", names(.dots))) %>%
select_(.dots = c(paste0("-.group.", names(.dots)), paste0("-", names(.dots), ".y"))) %>%
rename_(.dots = setNames(paste0(names(.dots), ".x"), names(.dots))) %>%
group_by_(.dots=groups)
}
现在进行基准测试:
require(microbenchmark)
# testing 1
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = runif(1e6))
#
microbenchmark(
r1 <- df %>% mutate(x = gsub('a', 't', x))
,
r2 <- df %>% mutate_group(x = gsub('a', 't', .group.x))
)
#
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% mutate(x = gsub("a", "t", x)) 324.9036 328.7171 337.6389 330.2874 345.2245 415.6200 100 b
## r2 <- df %>% mutate_group(x = gsub("a", "t", .group.x)) 117.0220 120.1766 128.9403 121.8053 135.4410 208.5801 100 a
#
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
#
# testing 2
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = sample(letters[1:4], 1e6, replace=TRUE),
z = runif(1e6))
microbenchmark(
r1 <- df %>% mutate(x = gsub('a', 't', x),
y = gsub('b', 's', y))
,
r2 <- df %>% mutate_group(x = gsub('a', 't', .group.x),
y = gsub('b', 's', .group.y))
)
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% mutate(x = gsub("a", "t", x), y = gsub("b", "s", y)) 665.9306 674.2292 691.7966 682.0874 695.8887 776.9765 100 b
## r2 <- df %>% mutate_group(x = gsub("a", "t", .group.x), y = gsub("b", "s", .group.y)) 150.9971 156.5910 177.6797 171.7907 177.9938 279.1329 100 a
#
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
#
# testing 3
set.seed(1)
df <- data.frame(x = sample(letters[1:4], 1e6, replace=TRUE),
y = sample(letters[1:4], 1e6, replace=TRUE),
z = runif(1e6))
#
microbenchmark(
r1 <- df %>% group_by(y) %>% mutate(x = gsub('a', 't', x))
,
r2 <- df %>% group_by(y) %>% mutate_group(x = gsub('a', 't', .group.x))
)
#
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## r1 <- df %>% group_by(y) %>% mutate(x = gsub("a", "t", x)) 399.8765 405.0650 415.0338 408.3451 423.2523 494.0247 100 b
## r2 <- df %>% group_by(y) %>% mutate_group(x = gsub("a", "t", .group.x)) 224.0281 231.9769 247.7521 244.8441 248.5926 319.9048 100 a
#
all.equal(r1, r2, check.attributes = FALSE)
## [1] TRUE
答案 3 :(得分:0)
好的,这是一个新版本。语法简化了。我仍然没有通过意外干扰解决这个问题。
library(stringr)
library(stringi)
library(dplyr)
library(dplyrExtras)
library(pipeR)
bank = list("dummy" = NULL)
oldBank = function() length(bank)
newBank = function() length(bank) + 1
oldIndex = function() paste("index", oldBank(), sep = "")
newIndex = function() paste("index", newBank(), sep = "")
selectOldIndex = function(data) data %>>%
select_(oldIndex())
removeOldIndex = function(data) data %>>%
select_(paste("-", oldIndex()))
focus = function(data) {
data[[newIndex()]] = 1:nrow(data)
bank[[newBank()]] <<- data
data
}
restore = function(zoom) {
restoreData =
bank[[oldBank()]] %>>%
removeOldIndex %>>%
left_join(zoom)
bank[[oldBank()]] <<-NULL
restoreData
}
restore_rows = function(zoom) {
restoreData =
bank[[oldBank()]] %>>%
anti_join(zoom %>>% selectOldIndex()) %>>%
bind_rows(zoom) %>>%
removeOldIndex()
bank[[oldBank()]] <<-NULL
restoreData
}
#example
options(stringsAsFactors = FALSE)
characters = c("1st", "2nd", "3rd", "other_value") %>>%
rep(10) %>>%
(data.frame(value = ., type = "character"))
numerics = c("1", "2", "3", ".") %>>%
rep(10) %>>%
(data.frame(value = ., type = "numeric"))
data = bind_rows(characters, numerics)
abbrevs = data_frame(
value = c("1st", "2nd", "3rd"),
full = c("first_value", "second_value", "third_value"))
results =
data %>>%
focus %>>%
group_by(value) %>>%
summarize(n = nrow(.), type = first(type)) %>>%
focus %>>%
filter(type == "character") %>>%
left_join(abbrevs) %>>%
mutate_if(is.na(full), full = value) %>>%
mutate(full = full %>>%
str_replace_all("_", " ") %>>%
stri_trans_totitle()) %>>%
restore_rows %>>%
restore %>>%
mutate_if(!is.na(full), value = full) %>>%
select(-full)
答案 4 :(得分:0)
好的,这是第3版。我通过允许用户为银行和索引指定自己的名称来解决干扰问题。
library(stringr)
library(stringi)
library(dplyr)
library(dplyrExtras)
library(pipeR)
library(lazyeval)
library(gtools)
construct_bank_index = defmacro(bank, index, expr = {
bank = list("NULL" = NULL)
old_bank = function() length(bank)
new_bank = function() length(bank) + 1
old_index = function() paste(index, old_bank(), sep = "")
newIndex = function() paste(index, new_bank(), sep = "")
select_old_index = function(data) data %>>%
select_(old_index())
remove_old_index = function(data) data %>>%
select_(paste("-", old_index()))
focus = function(data) {
data[[newIndex()]] = 1:nrow(data)
bank[[new_bank()]] <<- data
data
}
restore = function(zoom) {
restoreData =
bank[[old_bank()]] %>>%
remove_old_index %>>%
left_join(zoom)
bank[[old_bank()]] <<-NULL
restoreData
}
restore_rows = function(zoom) {
restoreData =
bank[[old_bank()]] %>>%
anti_join(zoom %>>% select_old_index()) %>>%
bind_rows(zoom) %>>%
remove_old_index()
bank[[old_bank()]] <<-NULL
restoreData
}
})
#example
construct_bank_index(bank, "index")
options(stringsAsFactors = FALSE)
characters = c("1st", "2nd", "3rd", "other_value") %>>%
rep(10) %>>%
(data.frame(value = ., type = "character"))
numerics = c("1", "2", "3", ".") %>>%
rep(10) %>>%
(data.frame(value = ., type = "numeric"))
data = bind_rows(characters, numerics)
abbrevs = data_frame(
value = c("1st", "2nd", "3rd"),
full = c("first_value", "second_value", "third_value"))
results =
data %>>%
focus %>>%
group_by(value) %>>%
summarize(n = n(), type = first(type)) %>>%
focus %>>%
filter(type == "character") %>>%
left_join(abbrevs) %>>%
mutate_if(is.na(full), full = value) %>>%
mutate(full = full %>>%
str_replace_all("_", " ") %>>%
stri_trans_totitle()) %>>%
restore_rows %>>%
restore %>>%
mutate_if(!is.na(full), value = full) %>>%
select(-full)