嘿,我需要填写数据框的缺失值。逻辑很简单,如果M[i, j + 1]
中有值,则使用M[i, j + 1]
,否则使用M[i, j - 1]
。但棘手的是,需要填充的范围是从每一行的开头一直到该行最后一个非NA值之后的那一列,而不仅仅是紧邻非空单元格的那些单元格。
这是数据
library(dplyr)

# Sample data: the first element of each vector is the row label, the rest are
# the values. c() coerces everything to character because of the label.
a1 <- c('a',9,8,rep(NA,5))
a2 <- c('b',NA,NA,NA,NA,3,NA,4)
a3 <- c('c',11,6,7,NA,NA,NA,6)
M <- rbind(a1, a2, a3)

# Number of real columns (label + values), recorded before the helper column
# last_non_na_col is appended so the fill loop never reads the helper column.
n_cols <- ncol(M)

# First non-NA value of every row; used for NAs that have no value on either
# side yet. (Named first_non_na rather than `t` to avoid masking base::t.)
# NOTE(review): this is indexed by the row position in the original matrix and
# has no entry for a row whose values are all NA - confirm the row order still
# matches after group_by()/do() if the labels are not already sorted.
values <- M[, -1]
ind <- !is.na(values)
first_non_na <- tapply(values[ind], row(values)[ind], head, 1)

# Convert to a data frame and record, per row, the column index (counting the
# label column) of the last non-NA cell.
M <- M %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  group_by(V1) %>%
  do(mutate(
    .,
    last_non_na_col = max(apply(., 1, function(x) max(which(!is.na(x)))))
  ))

for (i in seq_len(nrow(M))) {
  # Fill columns 3 .. (last non-NA column + 1), never past the real columns.
  upper <- min(n_cols, M$last_non_na_col[i] + 1)
  for (j in seq_len(max(upper - 2, 0)) + 2) {
    if (is.na(M[[j]][i])) {
      # Only look to the right while j + 1 is still a value column.
      has_next <- j < n_cols && !is.na(M[[j + 1]][i])
      if (has_next) {
        M[[j]][i] <- M[[j + 1]][i]
      } else if (!is.na(M[[j - 1]][i])) {
        # Left neighbour; it may itself have been filled earlier in this loop.
        M[[j]][i] <- M[[j - 1]][i]
      } else {
        M[[j]][i] <- first_non_na[[i]]
      }
    }
  }
  # Column 2 has no left neighbour: take column 3, which was filled above.
  if (is.na(M[[2]][i])) {
    M[[2]][i] <- M[[3]][i]
  }
}
原始数据就是这样
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
a1 "a" "9" "8" NA NA NA NA NA
a2 "b" NA NA NA NA "3" NA "4"
a3 "c" "11" "6" "7" NA NA NA "6"
我的代码输出如下,这是正确的。请注意,对于单元格M[3,5],填充值应为7(它之前的数字),而不是6(它之后的最近数字)。
V1 V2 V3 V4 V5 V6 V7 V8 last_non_na_col
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 a 9 8 8 NA NA NA NA 3
2 b 3 3 3 3 3 4 4 8
3 c 11 6 7 7 7 6 6 8
我在for循环中这样做。有人能帮我在tidyverse中做到这一点吗?
谢谢
凯西
答案 0 :(得分:1)
有了tbl_df
,我们可以使用tidyverse
方法
library(tidyverse)

# Fill gaps row by row (one group per V1) after reshaping to long format:
#   * an NA directly before an observed value takes that value (M[i, j + 1]);
#   * any other NA takes the nearest value to its left (M[i, j - 1]);
#   * NAs before the first observed value take that first value;
#   * only the first column after the last observed value is filled, the
#     remaining trailing columns stay NA.
# A plain fill(.direction = "up") would instead copy the *next* value across a
# whole run of NAs, e.g. 7 NA NA NA 6 -> 7 6 6 6 6 rather than 7 7 7 6 6.
gather(M, key, val, -V1) %>%
  group_by(V1) %>%
  mutate(
    # Position of the last observed value in the row (0 if there is none);
    # computed before val is modified.
    last_obs = max(0L, which(!is.na(val))),
    # An NA immediately followed by a value takes that value.
    val = coalesce(val, lead(val))
  ) %>%
  # Remaining gaps: carry the previous value forward ...
  fill(val, .direction = "down") %>%
  # ... and back-fill only what is left, i.e. NAs before the first value.
  fill(val, .direction = "up") %>%
  # Undo the forward fill beyond the first column after the last value.
  mutate(val = replace(val, row_number() > last_obs + 1L, NA)) %>%
  select(-last_obs) %>%
  spread(key, val)
# A tibble: 3 x 8
# Groups: V1 [3]
# V1 V2 V3 V4 V5 V6 V7 V8
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 a 9 8 8 NA NA NA NA
#2 b 3 3 3 3 3 4 4
#3 c 11 6 7 5 5 6 6
在OP的for
循环中,我们可以使用na.locf
(来自zoo
包,用相邻的非NA元素填充NA元素)
library(zoo)

# Column index in M (counting V1) of the last non-NA cell of every row,
# computed from the data before any filling and kept as a plain vector.
# For the sample data this is c(3, 8, 8).
last_non_na_col <- apply(!is.na(M), 1, function(x) max(which(x)))

for (i in seq_len(nrow(M))) {
  # Values of row i without the label column; vals[k] is column k + 1 of M.
  vals <- unlist(M[i, -1], use.names = FALSE)

  # An NA directly before an observed value takes that value (M[i, j + 1]).
  takes_next <- is.na(vals) & !is.na(c(vals[-1], NA))
  vals[takes_next] <- vals[which(takes_next) + 1L]

  # Every other NA takes the nearest value to its left (M[i, j - 1]) ...
  vals <- na.locf(vals, na.rm = FALSE)
  # ... and NAs before the first observed value take that first value.
  vals <- na.locf(vals, fromLast = TRUE, na.rm = FALSE)

  # Keep the fill only up to the first column after the last observed value
  # (column last_non_na_col[i] + 1, i.e. position last_non_na_col[i] in vals).
  vals[seq_along(vals) > last_non_na_col[i]] <- NA

  # One list element per column, so the single-row assignment is unambiguous.
  M[i, -1] <- as.list(vals)
}
M
# A tibble: 3 x 8
# Groups: V1 [3]
# V1 V2 V3 V4 V5 V6 V7 V8
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 a 9 8 8 NA NA NA NA
#2 b 3 3 3 3 3 4 4
#3 c 11 6 7 5 5 6 6
注意:在这里,我们将last_non_na_col
创建为vector
,而不是在数据集中创建一个单独的列,以简化索引编制
# Sample data used by the answer: a tibble grouped by V1, with all value
# columns stored as character. Note that V6 is "5" for row "c" here, whereas
# the question's a3 vector has NA in that position.
# Built with the public constructors instead of a dput() of the grouped_df
# internals (vars / indices / labels / group_sizes attributes), so it does not
# depend on the attribute layout of one particular dplyr version.
M <- dplyr::group_by(
  tibble::tibble(
    V1 = c("a", "b", "c"),
    V2 = c("9", NA, "11"),
    V3 = c("8", NA, "6"),
    V4 = c(NA, NA, "7"),
    V5 = c(NA_character_, NA_character_, NA_character_),
    V6 = c(NA, "3", "5"),
    V7 = c(NA_character_, NA_character_, NA_character_),
    V8 = c(NA, "4", "6")
  ),
  V1
)