假设我有一个类似下面的数据框,我需要找出其中的每一行:该行中有一个或多个缺失值(NA),并且其后至少跟着一个有效值(任意数字)。你能帮帮我吗?
# Example rows: an id, a diagnosis code, then nine readings (NA = missing).
# Mixing numbers with a string inside c() coerces every element to character.
a <- c(1, "S06.4", 6.7, 7.0, 6.5, 7.0, 7.2, NA, NA, 6.6, 6.7)
b <- c(2, "S06.2", 5.0, NA, 4.9, 7.8, 9.3, 8.0, 7.8, 8.0, NA)
c <- c(3, "S06.5", 7.0, 5.5, NA, NA, 7.2, 8.0, 7.6, NA, 6.7)
d <- c(4, "S06.5", 7.0, 7.0, 7.0, 6.9, 6.8, 9.0, 6.0, 6.6, 6.7)
e <- c(5, "S06.1", 6.7, NA, NA, NA, NA, NA, NA, NA, NA)
# Stack the five vectors as rows (row names a-e), then label the columns
# id, dx, dia01 ... dia09.
df <- data.frame(rbind(a, b, c, d, e))
names(df) <- c("id", "dx", sprintf("dia%02d", 1:9))
答案 0 :(得分:7)
使用:
# Mark every position where an NA in dia01-dia08 is directly followed by a
# non-NA value in the next column (dia02-dia09), then keep the rows that
# contain at least one such position.
na_then_value <- is.na(df[, 3:10]) & !is.na(df[, 4:11])
df[rowSums(na_then_value) > 0, ]
你得到:
  id    dx dia01 dia02 dia03 dia04 dia05 dia06 dia07 dia08 dia09
a  1 S06.4   6.7     7   6.5     7   7.2  <NA>  <NA>   6.6   6.7
b  2 S06.2     5  <NA>   4.9   7.8   9.3     8   7.8     8  <NA>
c  3 S06.5     7   5.5  <NA>  <NA>   7.2     8   7.6  <NA>   6.7
这是做什么的:
- `is.na(df[,3:10])` 检查 dia01 到 dia08 列中的哪些值为 NA,并返回一个逻辑矩阵。
- `!is.na(df[,4:11])` 检查 `df[,3:10]` 每一行中紧随其后的下一个值(即 dia02 到 dia09 列)是否不是 NA,同样返回一个逻辑矩阵。
- 两个矩阵相乘后,只有"NA 后面紧跟一个非 NA 值"的位置为 1;`rowSums(...) > 0` 检查每一行中该条件是否至少满足一次。

回复您的评论:如果您想确保 NA 后面跟的是一个数值,可以将上述解决方案改为:
# Step 1: turn every 'dia*' column (all columns except id and dx) into
# numeric. Entries that cannot be parsed as numbers become NA.
dia_idx <- seq_along(df)[-c(1, 2)]
df[dia_idx] <- lapply(
  df[dia_idx],
  function(col) as.numeric(as.character(col))
)
# Step 2: apply the same filter as before. Because non-numeric entries are
# now NA, only an NA that is directly followed by a real number counts.
na_then_value <- is.na(df[, 3:10]) & !is.na(df[, 4:11])
df[rowSums(na_then_value) > 0, ]
或者首先不转换为数字:
# Same filter, but df itself is left untouched: only the "next value"
# columns (4-11) are converted to numeric on the fly, so a following entry
# that is not a number is treated as NA.
next_as_num <- sapply(df[4:11], function(col) as.numeric(as.character(col)))
df[rowSums(is.na(df[, 3:10]) & !is.na(next_as_num)) > 0, ]
注:
按照您构建示例数据的方式,得到的所有列都会是因子列(在 R 4.0 之前;在 R ≥ 4.0 中则全部是字符列)。我想这并不是您想要的。
可能格式正确的示例数据集是:
# Well-typed version of the example data: character id/dx columns, numeric
# dia01-dia09 columns, and rows named a-e.
df <- data.frame(
  id = c("1", "2", "3", "4", "5"),
  dx = c("S06.4", "S06.2", "S06.5", "S06.5", "S06.1"),
  dia01 = c(6.7, 5, 7, 7, 6.7),
  dia02 = c(7, NA, 5.5, 7, NA),
  dia03 = c(6.5, 4.9, NA, 7, NA),
  dia04 = c(7, 7.8, NA, 6.9, NA),
  dia05 = c(7.2, 9.3, 7.2, 6.8, NA),
  dia06 = c(NA, 8, 8, 9, NA),
  dia07 = c(NA, 7.8, 7.6, 6, NA),
  dia08 = c(6.6, 8, NA, 6.6, NA),
  dia09 = c(6.7, NA, 6.7, 6.7, NA),
  row.names = c("a", "b", "c", "d", "e"),
  # Keep id and dx as character regardless of the R version's default.
  stringsAsFactors = FALSE
)
提出的方法也适用于此。
正如@Frank在评论中所指出的,最好以长格式存储数据。用:
library(data.table)
# Convert df to a data.table in place, then overwrite columns 3-11 with their
# numeric versions (.SD is restricted to those columns by .SDcols).
# The trailing [] prints the updated table.
setDT(df)[, 3:11 := lapply(.SD, function(x) as.numeric(as.character(x))), .SDcols = 3:11][]
# Reshape to long format (columns 1-2 as id variables, giving `variable` and
# `value` columns). Then, per (id, dx) group, return the group's rows only if
# some NA value is directly followed by a non-NA one; shift(type = 'lead')
# lines each value up with the next value in the group.
melt(df, id = 1:2)[, if(any(is.na(value) & !is.na(shift(value, type = 'lead')))) .SD, by = .(id, dx)]
你得到:
id dx variable value
1: 1 S06.4 dia01 6.7
2: 1 S06.4 dia02 7.0
3: 1 S06.4 dia03 6.5
4: 1 S06.4 dia04 7.0
5: 1 S06.4 dia05 7.2
6: 1 S06.4 dia06 NA
7: 1 S06.4 dia07 NA
8: 1 S06.4 dia08 6.6
9: 1 S06.4 dia09 6.7
10: 2 S06.2 dia01 5.0
11: 2 S06.2 dia02 NA
12: 2 S06.2 dia03 4.9
13: 2 S06.2 dia04 7.8
14: 2 S06.2 dia05 9.3
15: 2 S06.2 dia06 8.0
16: 2 S06.2 dia07 7.8
17: 2 S06.2 dia08 8.0
18: 2 S06.2 dia09 NA
19: 3 S06.5 dia01 7.0
20: 3 S06.5 dia02 5.5
21: 3 S06.5 dia03 NA
22: 3 S06.5 dia04 NA
23: 3 S06.5 dia05 7.2
24: 3 S06.5 dia06 8.0
25: 3 S06.5 dia07 7.6
26: 3 S06.5 dia08 NA
27: 3 S06.5 dia09 6.7
另一种选择是:
# Convert df to a data.table in place, then overwrite columns 3-11 with their
# numeric versions (.SD is restricted to those columns by .SDcols).
# The trailing [] prints the updated table.
setDT(df)[, 3:11 := lapply(.SD, function(x) as.numeric(as.character(x))), .SDcols = 3:11][]
# Inner part: in the melted (long) table, collect per (id, dx) group the row
# numbers (.I) where an NA is directly followed by a non-NA value.
# unique(..., by = 'id') keeps one hit per id and [, 'id'] keeps just the id
# column; joining that back onto df with on = 'id' returns the matching rows
# of the wide table.
df[unique(melt(df, id = 1:2)[, .I[is.na(value) & !is.na(shift(value, type = 'lead'))], by = .(id, dx)], by = 'id')[,'id'], on = 'id']
不过,这种方法的结果仍然是宽格式,与本答案第一部分所示的结果相同。
答案 1 :(得分:3)
另一个想法是使用 `apply`(`MARGIN = 1`)来遍历每一行,并将 NA 的最小索引与非 NA 的最大索引进行比较。代码及其给出的结果如下:
# Convert to numeric first to capture only valid numbers (as in @Jaap's answer)
df[-c(1, 2)] <- lapply(df[-c(1, 2)], function(x) as.numeric(as.character(x)))

# The original snippet used `d1` without defining it; here it holds only the
# measurement columns, so `id` and `dx` never take part in the comparison.
d1 <- df[-c(1, 2)]

# TRUE when the first NA in a row sits before the last non-NA value, i.e. at
# least one NA is followed by a valid value.
# The length checks avoid the warnings min()/max() give on empty input
# (rows with no NA at all, or with nothing but NA).
na_before_value <- function(i) {
  na_idx <- which(is.na(i))
  ok_idx <- which(!is.na(i))
  length(na_idx) > 0 && length(ok_idx) > 0 && min(na_idx) < max(ok_idx)
}

apply(d1, 1, na_before_value)
#     a     b     c     d     e
#  TRUE  TRUE  TRUE FALSE FALSE

# or, to get the matching rows themselves
df[apply(d1, 1, na_before_value), ]
答案 2 :(得分:2)
以下是使用 `rle()` 的解决方案:
(我使用了Jaap答案中的数据定义)
# Example data: character id/dx columns, numeric dia01-dia09 columns, and
# rows named a-e.
df <- data.frame(
  id = c("1", "2", "3", "4", "5"),
  dx = c("S06.4", "S06.2", "S06.5", "S06.5", "S06.1"),
  dia01 = c(6.7, 5, 7, 7, 6.7),
  dia02 = c(7, NA, 5.5, 7, NA),
  dia03 = c(6.5, 4.9, NA, 7, NA),
  dia04 = c(7, 7.8, NA, 6.9, NA),
  dia05 = c(7.2, 9.3, 7.2, 6.8, NA),
  dia06 = c(NA, 8, 8, 9, NA),
  dia07 = c(NA, 7.8, 7.6, 6, NA),
  dia08 = c(6.6, 8, NA, 6.6, NA),
  dia09 = c(6.7, NA, 6.7, 6.7, NA),
  row.names = c("a", "b", "c", "d", "e"),
  # Keep id and dx as character regardless of the R version's default.
  stringsAsFactors = FALSE
)
# Build the "is NA" matrix for the dia* columns (everything except id and
# dx), then run-length encode each row. R is a list holding one rle object
# per row of df.
na_mask <- is.na(df[-(1:2)])
R <- apply(na_mask, MARGIN = 1, FUN = rle)
# Decide, from one row's run-length encoding, whether a run of NAs is
# followed by at least one non-NA value.
#
# r: an `rle` object built from a logical "is NA" vector, so `r$values`
#    alternates between TRUE (a run of NAs) and FALSE (a run of valid values).
# Returns a single logical: TRUE when the first NA run is not the final run
# (so a valid value comes after it), FALSE when the row has no NA run or its
# first NA run is also the last run.
id.row <- function(r) {
  # Spell out `values`: `r$value` only worked through `$` partial matching.
  first_na <- which(r$values)[1]
  # No TRUE run at all: which() is empty, so [1] yields NA.
  if (is.na(first_na)) {
    return(FALSE)
  }
  # Runs alternate, so any run after the first NA run holds valid values.
  first_na < length(r$values)
}
# id.row returns a single TRUE/FALSE per row, so vapply() can enforce that
# result type instead of relying on sapply()'s simplification (which would
# return an empty list for empty input).
vapply(R, id.row, logical(1))
#> vapply(R, id.row, logical(1))
#     a     b     c     d     e
#  TRUE  TRUE  TRUE FALSE FALSE