我有一个具有两种类型值的数据框。我想分组切片。 预计这些小组将提供两个条件。每个小组应该是;
如果其中一个标准达到最大累计值,则应重置累计总和 然后重新开始。
id<- sample(1:33)
w <- c(2,1,32,5,1,1,12,1,2,32,32,32,1,3,2,12,1,1,1,1,1,1,5,3,5,1,1,1,2,7,2,32,1)
n <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
df <- data.frame(id, w, n)
预期结果(手动制作)
w cumsum_w n cumsum_n group
2 2 1 1 1
1 3 1 2 1
32 35 1 3 1
5 40 1 4 1
1 41 1 5 1
1 42 1 6 1
12 54 1 7 1
1 55 1 8 1
2 57 1 9 1
32 32 1 2 2
32 64 1 3 2
32 32 1 1 3
1 33 1 2 3
3 36 1 3 3
2 38 1 4 3
12 50 1 5 3
1 51 1 6 3
1 52 1 7 3
1 53 1 8 3
1 54 1 9 3
1 55 1 10 3
1 56 1 11 3
5 61 1 12 3
3 64 1 13 3
5 69 1 14 3
1 70 1 15 3
1 1 1 1 4
1 2 1 2 4
2 4 1 3 4
7 11 1 4 4
2 13 1 5 4
32 45 1 6 4
1 46 1 7 4
我试图解决一些方法:
library(BBmisc)
chunk(df, chunk.size = 75, n.chunks = 15)
Error in chunk(df, chunk.size = 75, n.chunks = 15) :
You must provide exactly one of 'chunk.size', 'n.chunks' or 'props'
cumsum_with_reset_group <- function(w, n, threshold_w, threshold_n) {
cumsum_w <- 0
cumsum_n <- 0
group <- 1
result <- numeric()
for (i in 1:length(w)) {
cumsum_w <- cumsum_w + w[i]
cumsum_n <- cumsum_n + n[i]
if (cumsum_w > threshold_w | cumsum_n > threshold_n) {
group <- group + 1
cumsum_w <- cumsum_w + w[i]
cumsum_n <- cumsum_n + n[i]
}
result = c(result, group)
}
return (result)
}
# cumsum with reset
cumsum_w_with_reset <- function(w, threshold_w) {
cumsum_w <- 0
group <- 1
result <- numeric()
for (i in 1:length(w)) {
cumsum_w <- cumsum_w + w[i]
if (cumsum_w > threshold_w) {
group <- group + 1
cumsum_w <- w[i]
}
result = c(result, cumsum_w)
}
return (result)
}
# cumsum with reset
cumsum_n_with_reset <- function(n, threshold_n) {
cumsum_n <- 0
group <- 1
result <- numeric()
for (i in 1:length(n)) {
cumsum_n <- cumsum_n + n[i]
if (cumsum_n > threshold_n | cumsum_w > threshold_w) {
group <- group + 1
cumsum_n <- n[i]
}
result = c(result, cumsum_n)
}
return (result)
}
# use functions above as window functions inside mutate statement
y<-df %>% group_by() %>%
mutate(
cumsum_w = cumsum_w_with_reset(w, 75),
cumsum_n =cumsum_n_with_reset(n, 15),
group = cumsum_with_reset_group(w, n, 75, 15)
) %>%
ungroup()
Error in mutate_impl(.data, dots) :
Evaluation error: object 'cumsum_w' not found
谢谢!
答案 0 :(得分:1)
这是一个hack,它通过重复的子集和绑定来完成。因此,对于大型数据集,这将非常慢。这将整个数据集作为输入。
library(dplyr)
cumsumdf <- function(df){
cumsum_75 <- function(x) {cumsum(x) %/% 76}
cumsum_15 <- function(x) {cumsum(x) %/% 16}
cumsum_w75 <- function(x) {cumsum(x) %% 76}
cumsum_n15 <- function(x) {cumsum(x) %% 16}
m <- nrow(df)
df$grp <- 0
df <- df %>%
group_by(grp) %>%
mutate(cumsum_w = numeric(m), cumsum_n = numeric(m))
n = 0
df2 <- df[0,]
while(nrow(df) >0 ){
df$cumsum_w = cumsum_75(df$w)
df$cumsum_n = cumsum_15(df$n)
n <- n + 1
df1 <- df[df$cumsum_n == 0 & df$cumsum_w == 0,]
df <- df[df$cumsum_n != 0 | df$cumsum_w != 0,]
df1$grp <- n
df1 <- df1 %>% group_by(grp) %>%
mutate(cumsum_w = cumsum_w75(w), cumsum_n = cumsum_n15(n))
df2 <- rbind(df2,df1)
}
return(df2)
}
cumsumdf(df)