Question

这个问题几乎等同于： Create new group based on cumulative sum and group

但是，当我将接受的解决方案应用于我的数据时，它没有预期的结果。

简而言之，我有一个包含两个变量的数据：domain和value。 Domain是具有多个观察值的组变量，而value是我想通过domain累积的一些连续值，并且是一个新的组变量newgroup。有三个主要规则：

我仅在每个domain中累积。如果我到达domain的末尾，那么累积将重置。
如果累计总和至少为1.0，则将其值总计至少为1.0的观测值分配给group1的不同值。请注意，一次观察即可满足此规则。
如果domain中的最后一组的总和小于1.0，则将其与同一domain中倒数第二个组合并。这反映在变量group2

以下数据已简化。数据通常包含10 ^ 5-10 ^ 6行，因此矢量化的解决方案将是理想的。

示例数据

domain <- c(rep(1,5),rep(2,8))
value <- c(1,0,2,2.5,0.1,0.1,0.5,0,0.2,0.6,0,0,0.1)
df_raw <- data.frame(domain,value)


 domain value
      1   1.0
      1   0.0
      1   2.0
      1   2.5
      1   0.1
      2   0.1
      2   0.5
      2   0.0
      2   0.2
      2   0.6
      2   0.0
      2   0.0
      2   0.1

所需的输出

cumsum_val <- c(1,0,2,2.5,0.1,0.1,0.6,0.6,0.8,1.4,0,0,0.1)
group1 <- c(1,2,2,3,4,5,5,5,5,5,6,6,6)
group2 <- c(1,2,2,3,3,4,4,4,4,4,4,4,4) #Satisfies Rule #3
df_want <- data.frame(domain,value,cumsum_val,group1,group2)

 domain value cumsum_val group1 group2
      1   1.0        1.0      1      1
      1   0.0        0.0      2      2
      1   2.0        2.0      2      2
      1   2.5        2.5      3      3
      1   0.1        0.1      4      3
      2   0.1        0.1      5      4
      2   0.5        0.6      5      4
      2   0.0        0.6      5      4
      2   0.2        0.8      5      4
      2   0.6        1.4      5      4
      2   0.0        0.0      6      4
      2   0.0        0.0      6      4
      2   0.1        0.1      6      4

我使用了以下代码：

sum0 <- function(x, y) { if (x + y >= 1.0) 0 else x + y }
is_start <- function(x) head(c(TRUE, Reduce(sum0, init=0, x, acc = TRUE)[-1] == 0), -1)
cumsum(ave(df_raw$value, df_raw$domain, FUN = is_start))
## 1 2 3 4 5 6 6 6 6 6 7 8 9

，但最后一行与上面的group1产生的值不同。产生group1输出是导致我出现问题的主要原因。有人可以帮助我理解功能is_start以及该功能如何产生分组吗？

编辑 akrun在上面简化示例的注释中提供了一些工作代码。但是，在某些情况下它不起作用。例如，

domain <- c(rep(1,7),rep(2,8))
value <- c(1,0,1,0,2,2.5,0.1,0.1,0.5,0,0.2,0.6,0,0,0.1)
df_raw <- data.frame(domain,value)

下面显示了输出，其中new来自akrun的代码，而group1和group2是基于规则＃2和＃3的期望分组。 new和group2之间的差异主要发生在前3行中。

 domain value new group1 group2
      1   1.0   1      1      1
      1   0.0   2      2      2
      1   1.0   3      2      2
      1   0.0   4      3      3
      1   2.0   4      3      3
      1   2.5   5      4      4
      1   0.1   5      5      4
      2   0.1   6      6      5
      2   0.5   6      6      5
      2   0.0   6      6      5
      2   0.2   6      6      5
      2   0.6   6      6      5
      2   0.0   6      7      5
      2   0.0   6      7      5
      2   0.1   6      7      5

编辑2 我已经更新了工作答案。

Answer 1

这有效！它结合使用purrr的accumulate（类似于cumsum，但用途更多）和cumsum并适当地使用group_by来获得您想要的东西。我添加了注释以指示每个部分在做什么。我会注意到next_group2有点用词不当-更多的是not_next_group2，但希望其余的都清楚。

library(tidyverse)

domain <- c(rep(1,5),rep(2,8))
value <- c(1,0,2,2.5,0.1,0.1,0.5,0,0.2,0.6,0,0,0.1)
df_raw <- data.frame(domain,value)

## Modified from: https://stackoverflow.com/questions/49076769/dplyr-r-cumulative-sum-with-reset
sum_reset_at = function(val_col, threshold, include.equals = TRUE) {
  if (include.equals) {
    purrr::accumulate({{val_col}}, ~if_else(.x>=threshold , .y, .x+.y))
  } else {
    purrr::accumulate({{val_col}}, ~if_else(.x>threshold , .y, .x+.y))
  }
}

df_raw %>% 
  group_by(domain) %>% 
  mutate(cumsum_val = sum_reset_at(value, 1)) %>% 
  mutate(next_group1 = ifelse(lag(cumsum_val) >= 1 | row_number() == 1, 1, 0)) %>% ## binary interpretation of whether there should be a new group
  ungroup %>% 
  mutate(group1 = cumsum(next_group1)) %>% ## generate new groups
  group_by(domain, group1) %>%
  mutate(next_group2 = ifelse(max(cumsum_val) < 1 & row_number() == 1, 1, 0)) %>% ## similar to above, but grouped by your new group1; we ask it only to transition at the first value of the group that doesn't reach 1
  ungroup %>% 
  mutate(group2 = cumsum(next_group1 - next_group2)) %>% ## cancel out the next_group1 binary if it meets the conditions of next_group2
  select(-starts_with("next_"))

并按规定产生：

# A tibble: 13 x 5
   domain value cumsum_val group1 group2
    <dbl> <dbl>      <dbl>  <dbl>  <dbl>
 1      1   1          1        1      1
 2      1   0          0        2      2
 3      1   2          2        2      2
 4      1   2.5        2.5      3      3
 5      1   0.1        0.1      4      3
 6      2   0.1        0.1      5      4
 7      2   0.5        0.6      5      4
 8      2   0          0.6      5      4
 9      2   0.2        0.8      5      4
10      2   0.6        1.4      5      4
11      2   0          0        6      4
12      2   0          0        6      4
13      2   0.1        0.1      6      4

Answer 2

下面的解决方案改编自Group vector on conditional sum。

助手Rcpp功能

library(Rcpp)
cppFunction('
IntegerVector CreateGroup(NumericVector x, int cutoff) {
    IntegerVector groupVec (x.size());
    int group = 1;
    int threshid = 0;
    double runSum = 0;
    for (int i = 0; i < x.size(); i++) {
        runSum += x[i];
        groupVec[i] = group;

        if (runSum >= cutoff) {
            group++;
            runSum = 0;
        }
    }
    return groupVec;
}
')

主要功能

domain <- c(rep(1,7),rep(2,8))
value <- c(1,0,1,0,2,2.5,0.1,0.1,0.5,0,0.2,0.6,0,0,0.1)
df_raw <- data.frame(domain,value)

df_raw %>%
  group_by(domain) %>%
  mutate(group1 = CreateGroup(value,1),
         group1 = ifelse(group1==max(group1) & last(value) < 1,
                        max(group1)-1,group1)) %>%
  ungroup() %>%
  mutate(group2 = rleid(group1))

 domain value group1 group2
      1   1.0      1      1
      1   0.0      2      2
      1   1.0      2      2
      1   0.0      3      3
      1   2.0      3      3
      1   2.5      4      4
      1   0.1      4      4
      2   0.1      1      5
      2   0.5      1      5
      2   0.0      1      5
      2   0.2      1      5
      2   0.6      1      5
      2   0.0      1      5
      2   0.0      1      5
      2   0.1      1      5

根据累计总和和其他分组创建分组

2 个答案: