我想创建一个新变量,该变量指示对该变量的值的首次特定观察。
在下面的示例数据集中,我希望有一个新变量“ firstna”,对于该玩家的第一次“ NA”观察值为“ 1”。
game_data <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data
player level points
1 1 1 20
2 1 2 NA
3 1 3 NA
4 1 4 NA
5 2 1 20
6 2 2 40
7 2 3 NA
8 2 4 NA
结果数据框应如下所示:
game_data_new <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA), firstna = c(0,1,0,0,0,0,1,0))
game_data_new
player level points firstna
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
说实话,我不知道该怎么做。如果有dplyr选项可以做到这一点。
答案 0 :(得分:8)
基本的R
解决方案:
ave(game_data$points, game_data$player,
FUN = function(x) seq_along(x) == match(NA, x, nomatch = 0))
答案 1 :(得分:5)
另一个ave
选项,可按组(NA
)首先查找player
。
game_data$firstna <- ave(game_data$points, game_data$player,
FUN = function(x) cumsum(is.na(x)) == 1)
game_data
# player level points firstna
#1 1 1 20 0
#2 1 2 NA 1
#3 1 3 NA 0
#4 1 4 NA 0
#5 2 1 20 0
#6 2 2 40 0
#7 2 3 NA 1
#8 2 4 NA 0
答案 2 :(得分:3)
这是data.table
的解决方案:
library("data.table")
game_data <- data.table(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# > game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# player level points firstna
# 1: 1 1 20 FALSE
# 2: 1 2 NA TRUE
# 3: 1 3 NA FALSE
# 4: 1 4 NA FALSE
# 5: 2 1 20 FALSE
# 6: 2 2 40 FALSE
# 7: 2 3 NA TRUE
# 8: 2 4 NA FALSE
答案 3 :(得分:2)
library(tidyverse)
library(data.table)
data.frame(
player = c(1,1,1,1,2,2,2,2),
level = c(1,2,3,4,1,2,3,4),
points = c(20,NA,NA,NA,20,40,NA,NA)
) -> game_data
game_data_base1 <- game_data
game_data_dt <- data.table(game_data)
microbenchmark::microbenchmark(
better_base = game_data$first_na <- ave(
game_data$points,
game_data$player,
FUN=function(x) seq_along(x)==match(NA,x,nomatch=0)
),
brute_base = do.call(
rbind.data.frame,
lapply(
split(game_data, game_data$player),
function(x) {
x$firstna <- 0
na_loc <- which(is.na(x$points))
if (length(na_loc) > 0) x$firstna[na_loc[1]] <- 1
x
}
)
),
tidy = game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(points))) %>%
ungroup(),
dt = game_data_dt[, firstna:=as.integer(is.na(points) & !is.na(shift(points))), player]
)
## Unit: microseconds
## expr min lq mean median uq max neval
## better_base 125.188 156.861 362.9829 191.6385 355.6675 3095.958 100
## brute_base 366.642 450.002 2782.6621 658.0380 1072.6475 174373.974 100
## tidy 998.924 1119.022 2528.3687 1509.0705 2516.9350 42406.778 100
## dt 330.428 421.211 1031.9978 535.8415 1042.1240 9671.991 100
答案 4 :(得分:1)
您可以按照玩家分组,然后进行变异检查以检查某行是否具有NA值,而上一行则没有
game_data %>%
group_by(player) %>%
mutate(firstna = ifelse(is.na(points) & lag(!is.na(points)),1,0)) %>%
ungroup()
结果:
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
答案 5 :(得分:1)
game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(points)))
按玩家分组,然后为既不适用又不适用于前几行的个案创建布尔向量。
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
如果您要在NA之前的最后一个非NA行上加上1,请用以下内容替换变异行:
mutate(lastnonNA=as.numeric(!is.na(points) & is.na(lead(points))))
NA块的第一行,一直运行到玩家组的末尾:
game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(cbind(points,cumsum(!is.na(points))))))
答案 6 :(得分:1)
使用base的另一种方法:
game_data$firstna <-
unlist(
tapply(game_data$points, game_data$player, function(x) {i<-which(is.na(x))[1];x[]<-0;x[i]<-1;x})
)
或作为另一个?ave
克隆:
ave(game_data$points, game_data$player, FUN = function(x) {
i<-which(is.na(x))[1];x[]<-0;x[i]<-1;x
})
答案 7 :(得分:0)
使用diff
transform(game_data, firstna = ave(is.na(points), player, FUN = function(x) c(0,diff(x))))
# player level points firstna
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0
及其等效的dplyr
:
library(dplyr)
game_data %>% group_by(player) %>% mutate(firstna = c(0,diff(is.na(points))))
# # A tibble: 8 x 4
# # Groups: player [2]
# player level points firstna
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0