方法1

Question

我有一个具有两种类型值的数据框。我想分组切片。预计这些小组将提供两个条件。每个小组应该是;

条件1：w <= 75
条件1：n <= 15

如果其中一个标准达到最大累计值，则应重置累计总和然后重新开始。

id<- sample(1:33)
w <- c(2,1,32,5,1,1,12,1,2,32,32,32,1,3,2,12,1,1,1,1,1,1,5,3,5,1,1,1,2,7,2,32,1)
n <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
df  <- data.frame(id, w, n)

预期结果（手动制作）

w cumsum_w n cumsum_n group
2     2     1   1   1
1     3     1   2   1
32    35    1   3   1
5     40    1   4   1
1     41    1   5   1
1     42    1   6   1
12    54    1   7   1
1     55    1   8   1
2     57    1   9   1
32    32    1   2   2
32    64    1   3   2
32    32    1   1   3
1     33    1   2   3
3     36    1   3   3
2     38    1   4   3
12    50    1   5   3
1     51    1   6   3
1     52    1   7   3
1     53    1   8   3
1     54    1   9   3
1     55    1   10  3
1     56    1   11  3
5     61    1   12  3
3     64    1   13  3
5     69    1   14  3
1     70    1   15  3
1     1     1   1   4
1     2     1   2   4
2     4     1   3   4
7     11    1   4   4
2     13    1   5   4
32    45    1   6   4
1     46    1   7   4

我试图解决一些方法：

方法1

library(BBmisc)
chunk(df, chunk.size = 75, n.chunks = 15)
Error in chunk(df, chunk.size = 75, n.chunks = 15) : 
  You must provide exactly one of 'chunk.size', 'n.chunks' or 'props'

方法2

cumsum_with_reset_group <- function(w, n, threshold_w, threshold_n) {
  cumsum_w <- 0
  cumsum_n <- 0
  group <- 1
  result <- numeric()

  for (i in 1:length(w)) {
    cumsum_w <- cumsum_w + w[i]
    cumsum_n <- cumsum_n + n[i]

    if (cumsum_w > threshold_w | cumsum_n > threshold_n) {
      group <- group + 1
      cumsum_w <- cumsum_w + w[i]
      cumsum_n <- cumsum_n + n[i]
    }

    result = c(result, group)

  }

  return (result)
}

# cumsum with reset
cumsum_w_with_reset <- function(w, threshold_w) {
  cumsum_w <- 0
  group <- 1
  result <- numeric()

  for (i in 1:length(w)) {
    cumsum_w <- cumsum_w + w[i]

    if (cumsum_w > threshold_w) {
      group <- group + 1
      cumsum_w <- w[i]
    }

    result = c(result, cumsum_w)

  }

  return (result)
}


# cumsum with reset
cumsum_n_with_reset <- function(n, threshold_n) {
  cumsum_n <- 0
  group <- 1
  result <- numeric()

  for (i in 1:length(n)) {
    cumsum_n <- cumsum_n + n[i]

    if (cumsum_n > threshold_n | cumsum_w > threshold_w) {
      group <- group + 1
      cumsum_n <- n[i]
    }

    result = c(result, cumsum_n)

  }

  return (result)
}
# use functions above as window functions inside mutate statement
y<-df %>% group_by() %>%
  mutate(
    cumsum_w = cumsum_w_with_reset(w, 75),
    cumsum_n  =cumsum_n_with_reset(n, 15),
    group = cumsum_with_reset_group(w, n, 75, 15)
  ) %>% 
  ungroup()

    Error in mutate_impl(.data, dots) : 
      Evaluation error: object 'cumsum_w' not found

谢谢！

Answer 1

这是一个hack，它通过重复的子集和绑定来完成。因此，对于大型数据集，这将非常慢。这将整个数据集作为输入。

library(dplyr)

cumsumdf <- function(df){
  cumsum_75 <- function(x) {cumsum(x) %/% 76}
  cumsum_15 <- function(x) {cumsum(x) %/% 16}  
  cumsum_w75 <- function(x) {cumsum(x) %% 76}
  cumsum_n15 <- function(x) {cumsum(x) %% 16}

  m <- nrow(df)

  df$grp <- 0
  df <- df %>%
    group_by(grp) %>%
    mutate(cumsum_w = numeric(m), cumsum_n = numeric(m))

  n = 0
  df2 <- df[0,]
  while(nrow(df) >0 ){
    df$cumsum_w = cumsum_75(df$w)
    df$cumsum_n = cumsum_15(df$n)

    n <- n + 1
    df1 <- df[df$cumsum_n == 0 & df$cumsum_w == 0,]
    df <- df[df$cumsum_n != 0 | df$cumsum_w != 0,]
    df1$grp <- n  
    df1 <- df1 %>% group_by(grp) %>%
      mutate(cumsum_w = cumsum_w75(w), cumsum_n = cumsum_n15(n))
    df2 <- rbind(df2,df1)
  }
  return(df2)
}

cumsumdf(df)

在R

方法1

方法2

1 个答案: