自上次满足条件以来观察到的行数

时间:2019-02-07 09:40:58

标签: r dplyr

我的Data Frame类似于本示例的前三列:

id    obs   value   newCol
a     1     uncool  NA
a     2     cool    1
a     3     uncool  NA
a     4     uncool  NA
a     5     cool    2
a     6     uncool  NA
a     7     cool    1
a     8     uncool  NA
b     1     cool    0

我需要新增一列(即上面的 newCol):对每个 value 为 "cool" 的行,统计它与上一个 "cool" 行之间 "uncool" 的行数;如果之前没有 "cool" 行,则从该组(按 id 分组)的第一行开始计数。

我该怎么做(理想情况下使用dplyr)?

4 个答案:

答案 0 :(得分:1)

除了id外,您还需要另一个分组变量,由grp = cumsum(dat$value == "cool") - (dat$value == "cool")给出,如下所示。

然后,您可以使用 mutate,在每个分组内把 sum(value == "uncool") 赋给 value == "cool" 的观察值,其余观察值赋为 NA。

library(dplyr)

# Count the "uncool" rows that lead up to each "cool" row within an id.
# `grp` is a running block number: cumsum(value == "cool") increments on every
# "cool" row, and subtracting (value == "cool") moves that "cool" row back into
# the block of "uncool" rows that precede it.
# The columns are referenced through data masking (`value`, not `dat$value`),
# so the pipeline keeps working if the input is renamed or filtered upstream.
# The result stays grouped by id and grp.
dat %>%
  group_by(id, grp = cumsum(value == "cool") - (value == "cool")) %>%
  mutate(
    newCool = if_else(value == "cool", sum(value == "uncool"), NA_integer_)
  )
# A tibble: 9 x 6
# Groups:   id, grp [5]
  id      obs value  newCol   grp newCool
  <chr> <int> <chr>   <int> <int>   <int>
1 a         1 uncool     NA     0      NA
2 a         2 cool        1     0       1
3 a         3 uncool     NA     1      NA
4 a         4 uncool     NA     1      NA
5 a         5 cool        2     1       2
6 a         6 uncool     NA     2      NA
7 a         7 cool        1     2       1
8 a         8 uncool     NA     3      NA
9 b         1 cool        0     3       0

数据

# Example data: the three input columns from the question plus the expected
# result column `newCol` (integer, NA on "uncool" rows).
dat <- data.frame(
  id = c("a", "a", "a", "a", "a", "a", "a", "a", "b"),
  obs = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L),
  value = c(
    "uncool", "cool", "uncool", "uncool", "cool", "uncool", "cool", "uncool",
    "cool"
  ),
  newCol = c(NA, 1L, NA, NA, 2L, NA, 1L, NA, 0L),
  stringsAsFactors = FALSE
)

答案 1 :(得分:1)

编写简单的函数来解决您的问题:

# Example input from the question: id, obs and value only (no result column).
data <- data.frame(
  id = rep(c("a", "b"), times = c(8, 1)),
  # c(1:8, 1) mixes integer and double, so the column is stored as double
  obs = c(1:8, 1),
  value = c(
    "uncool", "cool", "uncool", "uncool", "cool", "uncool", "cool", "uncool",
    "cool"
  ),
  stringsAsFactors = FALSE
)

# Count the "uncool" entries that directly precede each "cool" entry.
#
# Arguments:
#   vector      Character vector (or factor) of observations in row order.
#               Every element that is not "uncool" is treated as "cool".
#               Must not contain NA.
#   first_cool  Value returned for the first element when it is "cool".
#               Defaults to NA, so the first element is always NA; pass 0 to
#               report "no uncool rows before it" instead.
#
# Returns a numeric vector of the same length as `vector`: NA for "uncool"
# elements, otherwise the number of consecutive "uncool" elements immediately
# before that position. An empty input yields numeric(0).
cool_counter <- function(vector, first_cool = NA_real_) {
  n <- length(vector)
  if (n == 0L) {
    return(numeric(0))
  }
  if (anyNA(vector)) {
    stop("`vector` must not contain NA values", call. = FALSE)
  }

  is_uncool <- vector == "uncool"
  position <- seq_len(n)

  # Index of the most recent "cool" element strictly before each position
  # (0 when there is none yet).
  last_cool <- cummax(ifelse(is_uncool, 0L, position))
  previous_cool <- c(0L, last_cool[-n])

  # Every element between the previous "cool" and the current position is
  # "uncool", so the count is just the size of that gap.
  counts <- as.numeric(position - 1L - previous_cool)
  counts[is_uncool] <- NA_real_

  # The first element has no preceding rows; it only receives a value when it
  # is "cool", and then it is whatever the caller chose via `first_cool`.
  if (!is_uncool[1L]) {
    counts[1L] <- first_cool
  }
  counts
}

这给出了:

# Running function
library(dplyr)

# Apply cool_counter() to each id separately, then drop the grouping so that
# later operations on `data` work on the whole table again instead of silently
# running per id.
data <- data %>%
  group_by(id) %>%
  mutate(newCol = cool_counter(value)) %>%
  ungroup()

# Results
data
  id      obs value  newCol
  <chr> <dbl> <chr>   <dbl>
1 a         1 uncool     NA
2 a         2 cool        1
3 a         3 uncool     NA
4 a         4 uncool     NA
5 a         5 cool        2
6 a         6 uncool     NA
7 a         7 cool        1
8 a         8 uncool     NA
9 b         1 cool       NA

答案 2 :(得分:1)

我们可以创建一个辅助函数,该函数根据 value 列中 cool/uncool 的出现情况进行分组并计数,即(原答案的代码和输出在抓取时丢失,下方仅残留占位文本)

cool/uncool

给出,

cool

答案 3 :(得分:1)

我们可以通过从底部开始进行 cumsum 来定义组,然后使用 ave 为每个组构建一个向量:

# Number the blocks from the bottom: the reversed cumulative sum of
# value == "cool" makes every "cool" row the last row of its block, together
# with the "uncool" rows leading up to it, so the count is block size minus 1.
# ave() is given the logical vector value == "cool" rather than `value`
# itself: the per-group results are stored back into ave()'s first argument,
# and with a character input the counts would be coerced to strings.
transform(dat, newCol = ave(
  value == "cool", id, rev(cumsum(rev(value == "cool"))),
  FUN = function(is_cool) ifelse(is_cool, length(is_cool) - 1, NA_real_)))
#   id obs  value newCol
# 1  a   1 uncool     NA
# 2  a   2   cool      1
# 3  a   3 uncool     NA
# 4  a   4 uncool     NA
# 5  a   5   cool      2
# 6  a   6 uncool     NA
# 7  a   7   cool      1
# 8  a   8 uncool     NA
# 9  b   1   cool      0

使用 dplyr

# dplyr version: tag each row with its bottom-up block number, group by id and
# block, and give each "cool" row the block size minus itself. The helper
# column is dropped at the end.
dat %>%
  mutate(temp = rev(cumsum(rev(value == "cool")))) %>%
  group_by(id, temp) %>%
  mutate(newCol = ifelse(value == "cool", n() - 1, NA)) %>%
  ungroup() %>%
  select(-temp)
# # A tibble: 9 x 4
# id   obs  value newCol
#   <chr> <int>  <chr>  <dbl>
# 1     a     1 uncool     NA
# 2     a     2   cool      1
# 3     a     3 uncool     NA
# 4     a     4 uncool     NA
# 5     a     5   cool      2
# 6     a     6 uncool     NA
# 7     a     7   cool      1
# 8     a     8 uncool     NA
# 9     b     1   cool      0