指示首次出现特定值的新变量

时间:2018-11-26 13:12:43

标签: r dplyr panel transformation hierarchical-data

我想创建一个新变量,用来标记某个变量首次出现特定取值的那条观测。

在下面的示例数据集中,我希望得到一个新变量 “firstna”:在每个玩家第一次出现 “NA” 的那一行取值为 “1”,其余行取值为 “0”。

# Example panel: two players with four levels each; `points` contains NAs.
game_data <- data.frame(
  player = rep(c(1, 2), each = 4),
  level = rep(c(1, 2, 3, 4), times = 2),
  points = c(20, NA, NA, NA, 20, 40, NA, NA)
)

game_data
  player level points
1      1     1     20
2      1     2     NA
3      1     3     NA
4      1     4     NA
5      2     1     20
6      2     2     40
7      2     3     NA
8      2     4     NA

结果数据框应如下所示:

# Desired result: `firstna` is 1 on each player's first NA row, 0 elsewhere.
game_data_new <- data.frame(
  player = rep(c(1, 2), each = 4),
  level = rep(c(1, 2, 3, 4), times = 2),
  points = c(20, NA, NA, NA, 20, 40, NA, NA),
  firstna = c(0, 1, 0, 0, 0, 0, 1, 0)
)

game_data_new
  player level points firstna
1      1     1     20       0
2      1     2     NA       1
3      1     3     NA       0
4      1     4     NA       0
5      2     1     20       0
6      2     2     40       0
7      2     3     NA       1
8      2     4     NA       0

说实话,我不知道该怎么做。如果能用 dplyr 实现就最好了。

8 个答案:

答案 0 :(得分:8)

基本的R解决方案:

# Per player, compare every row position with the position of the first NA.
# `nomatch = 0` gives position 0 when a player has no NA, so no row matches.
with(
  game_data,
  ave(
    points,
    player,
    FUN = function(pts) {
      first_na_pos <- match(NA, pts, nomatch = 0)
      seq_along(pts) == first_na_pos
    }
  )
)

答案 1 :(得分:5)

另一个使用 ave 的方案,按组(player)查找第一个 NA:

# Flag, per player, the first row whose `points` is NA.
# `cumsum(is.na(x)) == 1` on its own stays TRUE for non-missing rows that
# follow the first NA (e.g. 20, NA, 30), so it is combined with `is.na(x)`
# to flag the NA row only.
game_data$firstna <- ave(
  game_data$points,
  game_data$player,
  FUN = function(x) is.na(x) & cumsum(is.na(x)) == 1
)

game_data
#  player level points firstna
#1      1     1     20       0
#2      1     2     NA       1
#3      1     3     NA       0
#4      1     4     NA       0
#5      2     1     20       0
#6      2     2     40       0
#7      2     3     NA       1
#8      2     4     NA       0

答案 2 :(得分:3)

这是data.table的解决方案:

library("data.table")

game_data <- data.table(
  player = c(1, 1, 1, 1, 2, 2, 2, 2),
  level = c(1, 2, 3, 4, 1, 2, 3, 4),
  points = c(20, NA, NA, NA, 20, 40, NA, NA)
)

# Per player, flag the NA row at which the running count of NAs reaches 1.
# This also flags a player whose very first row is NA, and flags only the
# first NA when a player has several separate runs of NAs.
game_data[, firstna := is.na(points) & cumsum(is.na(points)) == 1L, by = player][]
# > game_data[, firstna := is.na(points) & cumsum(is.na(points)) == 1L, by = player][]
#    player level points firstna
# 1:      1     1     20   FALSE
# 2:      1     2     NA    TRUE
# 3:      1     3     NA   FALSE
# 4:      1     4     NA   FALSE
# 5:      2     1     20   FALSE
# 6:      2     2     40   FALSE
# 7:      2     3     NA    TRUE
# 8:      2     4     NA   FALSE

答案 3 :(得分:2)

# Benchmark of four ways to flag each player's first NA in `points`.
library(tidyverse)
library(data.table)

# Example data, assigned with the right-hand arrow `->` at the end of the call.
data.frame(
  player = c(1,1,1,1,2,2,2,2),
  level = c(1,2,3,4,1,2,3,4),
  points = c(20,NA,NA,NA,20,40,NA,NA)
) -> game_data

# NOTE(review): game_data_base1 is not referenced again in this snippet.
game_data_base1 <- game_data
# data.table version of the same data, used by the `dt` expression below.
game_data_dt <- data.table(game_data)

microbenchmark::microbenchmark(

  # ave() + match(): per player, TRUE where the row position equals the
  # position of the first NA.
  # NOTE(review): the result is assigned to `first_na` (the other expressions
  # use `firstna`), and the assignment presumably adds that column to
  # game_data for the expressions timed afterwards — confirm.
  better_base = game_data$first_na <- ave(
    game_data$points,
    game_data$player,
    FUN=function(x) seq_along(x)==match(NA,x,nomatch=0)
  ),

  # split() by player, set firstna to 0, put a 1 at the first NA position if
  # the player has any NA, then rbind the pieces back together.
  brute_base = do.call(
    rbind.data.frame,
    lapply(
      split(game_data, game_data$player),
      function(x) {
        x$firstna <- 0
        na_loc <- which(is.na(x$points))
        if (length(na_loc) > 0) x$firstna[na_loc[1]] <- 1
        x
      }
    )
  ),

  # dplyr: within each player, a row is flagged when it is NA and not a
  # duplicate of an earlier value.
  tidy = game_data %>%
    group_by(player) %>%
    mutate(firstna=as.numeric(is.na(points) & !duplicated(points))) %>%
    ungroup(),

  # data.table: NA row whose previous row (within player) is not NA.
  # NOTE(review): shift() presumably yields NA for each player's first row, so
  # a player whose first row is NA would not be flagged here, unlike in the
  # other three expressions — confirm.
  dt = game_data_dt[, firstna:=as.integer(is.na(points) & !is.na(shift(points))), player]

)
## Unit: microseconds
##         expr     min       lq      mean    median        uq        max neval
##  better_base 125.188  156.861  362.9829  191.6385  355.6675   3095.958   100
##   brute_base 366.642  450.002 2782.6621  658.0380 1072.6475 174373.974   100
##         tidy 998.924 1119.022 2528.3687 1509.0705 2516.9350  42406.778   100
##           dt 330.428  421.211 1031.9978  535.8415 1042.1240   9671.991   100

答案 4 :(得分:1)

您可以按玩家分组,然后用 mutate 检查某一行是否为 NA 而其上一行不是 NA:

# Flag rows where `points` is NA and the previous row of the same player is
# not. `default = FALSE` treats the position before a player's first row as
# non-missing; without it lag() yields NA there, so a player whose first row
# is NA got `firstna = NA` instead of 1.
# Note: this marks the start of every run of NAs within a player.
game_data %>%
  group_by(player) %>%
  mutate(
    firstna = as.numeric(is.na(points) & !lag(is.na(points), default = FALSE))
  ) %>%
  ungroup()

结果:

# A tibble: 8 x 4
# Groups:   player [2]
  player level points firstna
   <dbl> <dbl>  <dbl>   <dbl>
1      1     1     20       0
2      1     2     NA       1
3      1     3     NA       0
4      1     4     NA       0
5      2     1     20       0
6      2     2     40       0
7      2     3     NA       1
8      2     4     NA       0

答案 5 :(得分:1)

# Within each player, a row is the first NA when it is NA and no earlier row
# of that player already holds the same value.
game_data %>%
  group_by(player) %>%
  mutate(
    firstna = as.numeric(!duplicated(points) & is.na(points))
  )

按玩家分组,然后创建一个布尔向量,标记那些取值为 NA 且与前面各行不重复的行。

# A tibble: 8 x 4
# Groups:   player [2]
  player level points firstna
   <dbl> <dbl>  <dbl>   <dbl>
1      1     1     20       0
2      1     2     NA       1
3      1     3     NA       0
4      1     4     NA       0
5      2     1     20       0
6      2     2     40       0
7      2     3     NA       1
8      2     4     NA       0

如果您想把 1 标在 NA 之前的最后一个非 NA 行上,请将 mutate 那一行替换为:

# `default = FALSE` keeps a player's final row from being flagged when no NA
# follows it (lead() otherwise pads the end of each group with NA).
mutate(lastnonNA = as.numeric(!is.na(points) & lead(is.na(points), default = FALSE)))

若要标记每个连续 NA 块的第一行(在示例数据中,NA 块一直持续到该玩家分组的末尾):

# Pair each value with the running count of non-NA values seen so far: all
# rows of one uninterrupted NA run share the same (NA, count) pair, so only
# the first row of the run is not a duplicate.
game_data %>%
  group_by(player) %>%
  mutate(
    firstna = as.numeric(
      is.na(points) & !duplicated(cbind(points, cumsum(!is.na(points))))
    )
  )

答案 6 :(得分:1)

使用base的另一种方法:

# Flag each player's first NA in `points`.
# split()/unsplit() put every group's result back on the rows it came from,
# so this also works when the rows are not sorted by player
# (unlist(tapply(...)) returns the values in group order instead).
game_data$firstna <- unsplit(
  lapply(
    split(game_data$points, game_data$player),
    function(x) {
      first_na <- which(is.na(x))[1]
      x[] <- 0
      # Players without any NA keep an all-zero vector.
      if (!is.na(first_na)) x[first_na] <- 1
      x
    }
  ),
  game_data$player
)

或者写成又一个 ave 版本(参见 ?ave):

# Same idea through ave(): within each player, zero the vector and put a 1 at
# the position of the first NA.
with(game_data, ave(points, player, FUN = function(vals) {
  first_na <- which(is.na(vals))[1]
  vals[] <- 0
  vals[first_na] <- 1
  vals
}))

答案 7 :(得分:0)

使用 diff 的方案:
# cumsum(x) > 0 switches from FALSE to TRUE at a player's first NA and stays
# TRUE, so diff() of that indicator (with a leading 0) is 1 exactly on the
# first NA row and 0 elsewhere. Taking diff() of is.na() directly would miss
# an NA in a player's first row and give -1 where values resume after NAs.
transform(
  game_data,
  firstna = ave(
    is.na(points), player,
    FUN = function(x) diff(c(0, cumsum(x) > 0))
  )
)
#   player level points firstna
# 1      1     1     20       0
# 2      1     2     NA       1
# 3      1     3     NA       0
# 4      1     4     NA       0
# 5      2     1     20       0
# 6      2     2     40       0
# 7      2     3     NA       1
# 8      2     4     NA       0

以及等效的 dplyr 写法:

library(dplyr)
# diff() of the "an NA has been seen so far" indicator is 1 only on each
# player's first NA row, including when that is the player's very first row.
game_data %>%
  group_by(player) %>%
  mutate(firstna = diff(c(0, cumsum(is.na(points)) > 0)))
# # A tibble: 8 x 4
# # Groups:   player [2]
#   player level points firstna
#    <dbl> <dbl>  <dbl>   <dbl>
# 1      1     1     20       0
# 2      1     2     NA       1
# 3      1     3     NA       0
# 4      1     4     NA       0
# 5      2     1     20       0
# 6      2     2     40       0
# 7      2     3     NA       1
# 8      2     4     NA       0