如何删除符合条件的行和与其相邻的行

时间:2016-11-10 20:20:42

标签: r data.table

我有以下示例数据:

# Sample data: 4 IDs, each observed on dates 1-6, with one score per row.
data <- data.table(
  ID    = rep(c(1, 2, 3, 4), each = 6),
  date  = rep(c(1, 2, 3, 4, 5, 6), times = 4),
  score = c(4, 3, 2, 2, 4, 1,
            5, 5, 5, 2, 1, 4,
            2, 1, 5, 5, 5, 3,
            5, 5, 5, 2, 4, 5)
)

   ID date score
 1:  1    1     4
 2:  1    2     3
 3:  1    3     2
 4:  1    4     2
 5:  1    5     4
 6:  1    6     1
 7:  2    1     5
 8:  2    2     5
 9:  2    3     5
10:  2    4     2
11:  2    5     1
12:  2    6     4
13:  3    1     2
14:  3    2     1
15:  3    3     5
16:  3    4     5
17:  3    5     5
18:  3    6     3
19:  4    1     5
20:  4    2     5
21:  4    3     5
22:  4    4     2
23:  4    5     4
24:  4    6     5
    ID date score

我希望删除某些行,并根据其他行在表中的位置来修改它们。对于每个 ID,我有两个标准:

  1. 如果某行满足 date == 1 且 score == 5,我想删除该行,以及其后紧跟着的所有 score == 5 的行,直到 score 不是 5 为止。(例如,对于 ID == 4,我想保留日期 4、5、6 的数据。)

  2. 对于 score == 5 的所有其他日期,我想将它们的分数替换为前两个分数的平均值(如果之前只有一个分数,则直接使用那个分数)。

所以,我想最终得到的表是:

    ID date score
 1:  1    1   4.0
 2:  1    2   3.0
 3:  1    3   2.0
 4:  1    4   2.0
 5:  1    5   4.0
 6:  1    6   1.0
 7:  2    4   2.0
 8:  2    5   1.0
 9:  2    6   4.0
10:  3    1   2.0
11:  3    2   1.0
12:  3    3   1.5
13:  3    4   1.5
14:  3    5   1.5
15:  3    6   3.0
16:  4    4   2.0
17:  4    5   4.0
18:  4    6   3.0

最好的方法是什么?我想它是 shift 的某种组合,但我无法把它放在一起。

2 个答案:

答案 0 :(得分:1)

# Row indices to drop (criterion 1): within each ID, every row of a run of
# identical scores whose first row has date == 1 and score == 5.
torm <- data[
  ,
  if (date[1] == 1 && score[1] == 5) .I,
  by = .(ID, rleid(score), cumsum(date == 1))
]$V1

library(caTools) # for running mean

# Remove the flagged rows by keeping the complement of `torm`. Negative
# indexing (`data[-torm]`) would return zero rows whenever `torm` is empty,
# i.e. when no ID starts with a 5 on date 1.
data[setdiff(seq_len(nrow(data)), torm)
   # add a running mean (current and previous score) within each ID
   ][, mn := runmean(score, 2, endrule = 'keep', align = 'right'), by = ID
   # compute the new score - a little care needed here in case we only have 5's in a group;
   # each group below starts at a non-5 row, and the 5's that follow it take that row's mean
   ][, new.score := ifelse(score == 5, mn[which(score != 5)[1]], score)
     , by = .(ID, cumsum(score != 5))][]
#    ID date score  mn new.score
# 1:  1    1     4 4.0       4.0
# 2:  1    2     3 3.5       3.0
# 3:  1    3     2 2.5       2.0
# 4:  1    4     2 2.0       2.0
# 5:  1    5     4 3.0       4.0
# 6:  1    6     1 2.5       1.0
# 7:  2    4     2 2.0       2.0
# 8:  2    5     1 1.5       1.0
# 9:  2    6     4 2.5       4.0
#10:  3    1     2 2.0       2.0
#11:  3    2     1 1.5       1.0
#12:  3    3     5 3.0       1.5
#13:  3    4     5 5.0       1.5
#14:  3    5     5 5.0       1.5
#15:  3    6     3 4.0       3.0
#16:  4    4     2 2.0       2.0
#17:  4    5     4 3.0       4.0
#18:  4    6     5 4.5       3.0

答案 1 :(得分:-1)

使用 zoo 包中的 na.locf

library(zoo)

# Sample data as a plain data.frame: 4 IDs, dates 1-6 for each ID, one score
# per row.
DF <- data.frame(
  ID    = rep(c(1, 2, 3, 4), each = 6),
  date  = rep(c(1, 2, 3, 4, 5, 6), times = 4),
  score = c(4, 3, 2, 2, 4, 1,
            5, 5, 5, 2, 1, 4,
            2, 1, 5, 5, 5, 3,
            5, 5, 5, 2, 4, 5)
)



# Mark rows for deletion (criterion 1).
# markForDel is 1 for a row with date == 1 and score == 5, 0 for any row whose
# score is not 5, and NA for the remaining 5s, which inherit the flag of the
# closest preceding row.
# NOTE(review): assumes rows are ordered by date within each ID - confirm.
DF$markForDel <- NA_real_
DF$markForDel[DF$date == 1 & DF$score == 5] <- 1
DF$markForDel[DF$score != 5] <- 0

# Carry the flag forward separately for each ID so that a deletion run never
# leaks into the next ID. na.rm = FALSE keeps leading NAs in place; the default
# (na.rm = TRUE) drops them, which shortens the vector and makes the
# assignment fail.
DF$markForDel <- ave(
  DF$markForDel, DF$ID,
  FUN = function(flag) zoo::na.locf(flag, na.rm = FALSE)
)

# 5s at the start of an ID that are not preceded by a date == 1 row are still
# NA here; they are not part of a deletion run, so keep them.
DF$markForDel[is.na(DF$markForDel)] <- 0

newDF <- DF[DF$markForDel != 1, ]
rownames(newDF) <- NULL


# Impute every remaining score == 5 (criterion 2).
# The 5s are first blanked to NA, then each NA is replaced by the mean of the
# (up to) two most recent non-5 scores that precede it within the same ID.

newDF$score[newDF$score == 5] <- NA

# For one ID's score vector `s`, return `s` with each NA replaced by the mean
# of the last two non-NA values before it. With a single earlier value that
# value is used as is; with none the entry stays NA.
impute_from_previous <- function(s) {
  vapply(seq_along(s), function(i) {
    if (!is.na(s[i])) {
      return(s[i])
    }
    prior <- s[seq_len(i - 1L)]
    prior <- utils::tail(prior[!is.na(prior)], 2L)
    if (length(prior) == 0L) NA_real_ else mean(prior)
  }, numeric(1))
}

# ave() applies the helper per ID, so scores from another ID are never used.
newDF$imputedScore <- ave(newDF$score, newDF$ID, FUN = impute_from_previous)

输出:

newDF
#   ID date score markForDel imputedScore
#1   1    1     4          0          4.0
#2   1    2     3          0          3.0
#3   1    3     2          0          2.0
#4   1    4     2          0          2.0
#5   1    5     4          0          4.0
#6   1    6     1          0          1.0
#7   2    4     2          0          2.0
#8   2    5     1          0          1.0
#9   2    6     4          0          4.0
#10  3    1     2          0          2.0
#11  3    2     1          0          1.0
#12  3    3    NA          0          1.5
#13  3    4    NA          0          1.5
#14  3    5    NA          0          1.5
#15  3    6     3          0          3.0
#16  4    4     2          0          2.0
#17  4    5     4          0          4.0
#18  4    6    NA          0          3.0