我有一个非常大的数据框,我想按 id 分组,删除某一列中位于两个特定值之间的行——但仅当这些行被夹在这些值中间时才删除,位于组的开头或结尾的行应保留。在下面的示例中,我想删除夹在 or == 'plan' 的行之间的 or == 'base' 的行。
# Example input: two ids with six rows each. `or` marks every row as either
# "base" or "plan"; `fd` and `rem` are payload columns.
id <- rep(c(1, 2), each = 6)
fd <- rep(c(101, 102, 103, 104, 105, 106), times = 2)
rem <- c(
  100, 120, 120, 140, 140, 150,
  200, 220, 220, 250, 300, 310
)
or <- c(
  "base", "base", "plan", "base", "plan", "base",
  "plan", "base", "plan", "base", "plan", "base"
)
df <- data.frame(id, fd, rem, or)
期望的结果:
# Expected output: within each id, the "base" rows enclosed by "plan" rows
# are gone; leading and trailing "base" rows stay.
id1 <- c(rep(1, 5), rep(2, 4))
# For id 1 the surviving rows are fd 101, 102, 103, 105 and 106. The input
# only contains fd values 101..106, so there is no row 107 to keep.
fd1 <- c(101, 102, 103, 105, 106, 101, 103, 105, 106)
or1 <- c("base", "base", "plan", "plan", "base", "plan", "plan", "plan", "base")
df1 <- data.frame(id1, fd1, or1)
答案 0 :(得分:4)
两种可能的解决方案:
1)使用基础R:
# Within each id, mark every "base" row whose previous AND next row are both
# "plan". The group edges are padded with "base", so the first and last row
# of a group can never count as enclosed.
# ave() writes the per-group logical result back into a copy of the character
# column `or`, so the flags come back as the strings "TRUE"/"FALSE"; comparing
# with "FALSE" turns them into the mask of rows to keep.
idx <- ave(df$or, df$id, FUN = function(x) {
  prev_or <- c("base", head(x, -1))
  next_or <- c(tail(x, -1), "base")
  x == "base" & prev_or == "plan" & next_or == "plan"
}) == "FALSE"
df[idx, ]
给出:
id fd rem or
1 1 101 100 base
2 1 102 120 base
3 1 103 120 plan
5 1 105 140 plan
6 1 106 150 base
7 2 101 200 plan
9 2 103 220 plan
11 2 105 300 plan
12 2 106 310 base
2)使用 data.table 包:
library(data.table)
setDT(df)
# Per id, collect the row numbers (.I) of all rows that are NOT a "base" row
# with a "plan" row directly before and directly after it. The shifted vectors
# are filled with "base" at the group edges, so edge rows are always kept.
idx <- df[, {
  prev_or <- shift(or, fill = "base")
  next_or <- shift(or, fill = "base", type = "lead")
  .I[!(or == "base" & prev_or == "plan" & next_or == "plan")]
}, by = id]$V1
df[idx]
给出:
id fd rem or
1: 1 101 100 base
2: 1 102 120 base
3: 1 103 120 plan
4: 1 105 140 plan
5: 1 106 150 base
6: 2 101 200 plan
7: 2 103 220 plan
8: 2 105 300 plan
9: 2 106 310 base
或者一气呵成:
library(data.table)
# Same filter as a single expression: convert df by reference and subset it
# directly with the per-id row numbers of the rows to keep.
setDT(df)[
  df[
    ,
    .I[!(or == "base" &
           shift(or, fill = "base") == "plan" &
           shift(or, fill = "base", type = "lead") == "plan")],
    by = id
  ]$V1
]
针对评论中的问题:您可以使用 rle 函数来检测位于 'plan' 行之间的多个连续 'base' 行,如下所示(使用基础 R):
# Build a second example in which rows 4 and 8 of df appear twice, giving
# runs of consecutive "base" rows between "plan" rows.
df2 <- df[c(1:4, 4:8, 8:12), ]
# the new example dataset:
> df2
id fd rem or
1 1 101 100 base
2 1 102 120 base
3 1 103 120 plan
4 1 104 140 base
4.1 1 104 140 base
5 1 105 140 plan
6 1 106 150 base
7 2 101 200 plan
8 2 102 220 base
8.1 2 102 220 base
9 2 103 220 plan
10 2 104 250 base
11 2 105 300 plan
12 2 106 310 base
# Keep-mask for the values of one group.
# x: vector of labels ("base" / "plan") in row order.
# Returns a logical vector as long as x: FALSE for every element of a run of
# "base" that has a run of "plan" directly before and directly after it,
# TRUE otherwise. The ends are padded with "base", so a leading or trailing
# run of "base" is always kept.
f <- function(x) {
  runs <- rle(x)
  run_vals <- runs$values
  before <- c("base", head(run_vals, -1))
  after <- c(tail(run_vals, -1), "base")
  runs$values <- !(run_vals == "base" & before == "plan" & after == "plan")
  # Expand the per-run flags back to one flag per element.
  inverse.rle(runs)
}
# Evaluate f() separately within every id. ave() returns the flags in the
# original row order, stored in the type of its first argument (character
# for a character `or` column), hence the conversion back to logical.
idx2 <- as.logical(ave(df2[["or"]], df2[["id"]], FUN = f))
# Keep only the rows flagged TRUE.
df2[idx2, ]
给出:
> df2[idx2,]
id fd rem or
1 1 101 100 base
2 1 102 120 base
3 1 103 120 plan
5 1 105 140 plan
6 1 106 150 base
7 2 101 200 plan
9 2 103 220 plan
11 2 105 300 plan
12 2 106 310 base
基础 R 中的另一个选项(受评论中 @Frank 的 data.table 建议启发):
# Keep-mask for the values of one group.
# x: vector of labels ("base" / "plan") in row order.
# Returns a logical vector as long as x that is FALSE exactly for the "base"
# elements positioned strictly between the first and the last "plan" element,
# and TRUE everywhere else.
f2 <- function(x) {
  plan_pos <- which(x == "plan")
  base_pos <- which(x == "base")
  first_plan <- head(plan_pos, 1)
  last_plan <- tail(plan_pos, 1)
  enclosed <- base_pos[base_pos > first_plan & base_pos < last_plan]
  # A position is kept when it is not among the enclosed "base" positions.
  is.na(match(seq_along(x), enclosed))
}
# Apply f2() within every id and build a logical row index.
# ave() is used instead of unlist(by(...)): by() returns its results group by
# group, so its flattened output only lines up with the rows of df2 when the
# rows of each id are contiguous and already in group order. ave() puts each
# flag back at the position of the row it belongs to. The flags are stored in
# the type of the first argument (character for a character `or` column), so
# they are converted back to logical.
idx3 <- as.logical(ave(df2$or, df2$id, FUN = f2))
df2[idx3, ]
使用 data.table 时,您可以采用 @Frank 的建议:
setDT(df2)
# Per id, flag the rows to keep: every "plan" row, plus any row located
# before the first or after the last "plan" row of that id. The flag is
# stored in a new column `keep`, which is then used to select the rows.
df2[, keep := {
  is_plan <- or == "plan"
  plan_pos <- which(is_plan)
  if (length(plan_pos) == 0L) {
    # No "plan" row in this id, so nothing is enclosed: keep every row.
    # Comparing against an empty position vector would otherwise collapse
    # the whole expression to length zero instead of one flag per row.
    rep(TRUE, .N)
  } else {
    row_no <- seq_len(.N)
    is_plan | row_no < plan_pos[1L] | row_no > plan_pos[length(plan_pos)]
  }
}, by = id
][!!keep]
使用的数据:
# Reproducible copy of the example data: 12 rows, 2 ids, with `or` kept as a
# character column regardless of the R version's factor default.
df <- data.frame(
  id = rep(c(1, 2), each = 6),
  fd = rep(c(101, 102, 103, 104, 105, 106), times = 2),
  rem = c(100, 120, 120, 140, 140, 150, 200, 220, 220, 250, 300, 310),
  or = c(
    "base", "base", "plan", "base", "plan", "base",
    "plan", "base", "plan", "base", "plan", "base"
  ),
  stringsAsFactors = FALSE
)