Question

我有一个非常大的数据框，我想在每个 id 内删除夹在某一列特定取值之间的行，但仅当这些行位于该取值之间时才删除，位于开头或结尾的行则保留。在示例中，我想删除夹在 or = 'plan' 的行之间的 or = 'base' 的行。

# Example data: two ids with six observations each.
id <- rep(c(1, 2), each = 6)
fd <- rep(c(101, 102, 103, 104, 105, 106), times = 2)
rem <- c(
  100, 120, 120, 140, 140, 150,
  200, 220, 220, 250, 300, 310
)
or <- c(
  "base", "base", "plan", "base", "plan", "base",
  "plan", "base", "plan", "base", "plan", "base"
)
df <- data.frame(id = id, fd = fd, rem = rem, or = or)

结果：

# Expected result: within each id, every 'base' row that sits between two
# 'plan' rows is dropped (input rows 4, 8 and 10), the rest is kept.
id1 <- c(rep(1, 5), rep(2, 4))
# fd of the surviving rows; the values must come from the input column fd
# (101..106), so id 1 keeps 101, 102, 103, 105, 106.
fd1 <- c(101, 102, 103, 105, 106, 101, 103, 105, 106)
or1 <- c("base", "base", "plan", "plan", "base", "plan", "plan", "plan", "base")

df1 <- data.frame(id1, fd1, or1)

Answer 1

两种可能的解决方案：

1）使用基础R：

# Within each id, flag a 'base' row whose previous and next rows are both
# 'plan'. The ends of every group are padded with 'base', so the first and
# last row of a group can never be flagged. ave() hands the logical result
# back as character because df$or is character, hence the as.logical().
idx <- !as.logical(ave(df$or, df$id, FUN = function(x) {
  before <- c("base", head(x, -1))
  after <- c(tail(x, -1), "base")
  x == "base" & before == "plan" & after == "plan"
}))
df[idx, ]

给出：

   id  fd rem   or
1   1 101 100 base
2   1 102 120 base
3   1 103 120 plan
5   1 105 140 plan
6   1 106 150 base
7   2 101 200 plan
9   2 103 220 plan
11  2 105 300 plan
12  2 106 310 base

2）使用 data.table 包：

library(data.table)
setDT(df)

# Per id, collect the global row numbers (.I) of all rows that are NOT a
# 'base' row enclosed by a 'plan' row on both sides. shift() pads the group
# edges with 'base', so edge rows are always kept.
idx <- df[, {
  prev_or <- shift(or, fill = "base")
  next_or <- shift(or, fill = "base", type = "lead")
  .I[!(or == "base" & prev_or == "plan" & next_or == "plan")]
}, by = id]$V1
df[idx]

给出：

   id  fd rem   or
1:  1 101 100 base
2:  1 102 120 base
3:  1 103 120 plan
4:  1 105 140 plan
5:  1 106 150 base
6:  2 101 200 plan
7:  2 103 220 plan
8:  2 105 300 plan
9:  2 106 310 base

或者一气呵成：

library(data.table)
# Same filter in a single expression: setDT() converts df by reference first,
# then the inner call returns the row numbers to keep, grouped by id.
setDT(df)[
  df[
    ,
    .I[!(or == "base" &
           shift(or, fill = "base") == "plan" &
           shift(or, fill = "base", type = "lead") == "plan")],
    by = id
  ]$V1
]

针对评论中的问题：您可以使用 rle 函数来处理夹在 'plan' 行之间的多个连续 'base' 行，如下所示（使用基础 R）：

# build a second example in which rows 4 and 8 each appear twice, giving
# runs of two consecutive 'base' rows between 'plan' rows
df2 <- df[sort(c(seq_len(12), 4, 8)), ]

# the new example dataset:
> df2
    id  fd rem   or
1    1 101 100 base
2    1 102 120 base
3    1 103 120 plan
4    1 104 140 base
4.1  1 104 140 base
5    1 105 140 plan
6    1 106 150 base
7    2 101 200 plan
8    2 102 220 base
8.1  2 102 220 base
9    2 103 220 plan
10   2 104 250 base
11   2 105 300 plan
12   2 106 310 base

# define function
# Keep-mask for one id group: FALSE for every run of consecutive 'base'
# values that sits directly between two 'plan' runs, TRUE for everything else.
#
# x: labels of one group, in row order (character or factor).
# Returns a logical vector of the same length as x.
f <- function(x) {
  # rle() rejects factors, so work on the labels as plain character
  rl <- rle(as.character(x))
  vals <- rl$values
  # pad both ends with 'base' so a leading or trailing run is never flagged
  prev_val <- c("base", head(vals, -1))
  next_val <- c(tail(vals, -1), "base")
  # %in% never returns NA, so rows with a missing label are kept instead of
  # turning into NA entries in the mask
  rl$values <- !(vals %in% "base" & prev_val %in% "plan" & next_val %in% "plan")
  inverse.rle(rl)
}

# evaluate f separately for every id and stitch the per-group masks back
# together in the original row order
idx2 <- unsplit(lapply(split(df2$or, df2$id), f), df2$id)

# keep only the rows whose mask entry is TRUE
df2[idx2, ]

给出：

> df2[idx2,]
   id  fd rem   or
1   1 101 100 base
2   1 102 120 base
3   1 103 120 plan
5   1 105 140 plan
6   1 106 150 base
7   2 101 200 plan
9   2 103 220 plan
11  2 105 300 plan
12  2 106 310 base

基础 R 中的另一个选项（受评论中 @Frank 的 data.table 建议的启发）：

# Keep-mask for one id group: FALSE for every 'base' element located strictly
# between the first and the last 'plan' element, TRUE for all other elements.
#
# x: labels of one group, in row order.
# Returns a logical vector of the same length as x.
f2 <- function(x) {
  plan_pos <- which(x == "plan")
  base_pos <- which(x == "base")
  first_plan <- head(plan_pos, 1)
  last_plan <- tail(plan_pos, 1)
  # with no 'plan' at all the comparisons are zero-length and nothing is dropped
  inner_base <- base_pos[base_pos > first_plan & base_pos < last_plan]
  keep <- rep(TRUE, length(x))
  keep[inner_base] <- FALSE
  keep
}

# Apply f2 within each id and put the per-group masks back in the original
# row order. unlist(by(...)) would concatenate the groups in sorted-id order,
# which only lines up with the rows when df2 is already sorted by id;
# unsplit() restores the original positions regardless of row order.
idx3 <- unsplit(lapply(split(df2$or, df2$id), f2), df2$id)
df2[idx3, ]

使用 data.table 时，您可以采用 @Frank 的建议：

setDT(df2)
# Per id: keep every 'plan' row plus all rows located before the first or
# after the last 'plan' row; the flag is stored in a new column `keep`.
df2[, keep := {
  is_plan <- or == "plan"
  plan_pos <- which(is_plan)
  row_no <- seq_len(.N)
  is_plan | row_no < first(plan_pos) | row_no > last(plan_pos)
}, by = id
    ][(keep)]

使用过的数据：

# The example data as a plain data.frame() call: 12 rows, automatic row
# names, `or` stored as character.
df <- data.frame(
  id = rep(c(1, 2), each = 6),
  fd = rep(c(101, 102, 103, 104, 105, 106), times = 2),
  rem = c(100, 120, 120, 140, 140, 150, 200, 220, 220, 250, 300, 310),
  or = c("base", "base", "plan", "base", "plan", "base",
         "plan", "base", "plan", "base", "plan", "base"),
  stringsAsFactors = FALSE
)

删除列值之间的行

1 个答案: