我有以下示例数据:
# Sample data: four IDs, each with dates 1..6 and one score per date.
data <- data.table(
  ID    = rep(c(1, 2, 3, 4), each = 6),
  date  = rep(c(1, 2, 3, 4, 5, 6), times = 4),
  score = c(4, 3, 2, 2, 4, 1,   # ID 1
            5, 5, 5, 2, 1, 4,   # ID 2
            2, 1, 5, 5, 5, 3,   # ID 3
            5, 5, 5, 2, 4, 5)   # ID 4
)
ID date score
1: 1 1 4
2: 1 2 3
3: 1 3 2
4: 1 4 2
5: 1 5 4
6: 1 6 1
7: 2 1 5
8: 2 2 5
9: 2 3 5
10: 2 4 2
11: 2 5 1
12: 2 6 4
13: 3 1 2
14: 3 2 1
15: 3 3 5
16: 3 4 5
17: 3 5 5
18: 3 6 3
19: 4 1 5
20: 4 2 5
21: 4 3 5
22: 4 4 2
23: 4 5 4
24: 4 6 5
ID date score
我希望消除某些行,并根据它们在表中的位置来更改其他行。对于每个 ID,我有两个标准:
如果某行有 date == 1 且 score == 5,我想删除该行,以及紧随其后、score == 5 的所有连续后续行,直到 score 不再是 5 为止。(例如,对于 ID == 4,我想保留日期 4、5、6 的数据。)
对于 score == 5 的所有其他日期,我想将它们的分数替换为前两个分数的平均值(如果之前只有一个分数,则直接使用那个分数)。
所以,我想最终得到的表是:
ID date score
1: 1 1 4.0
2: 1 2 3.0
3: 1 3 2.0
4: 1 4 2.0
5: 1 5 4.0
6: 1 6 1.0
7: 2 4 2.0
8: 2 5 1.0
9: 2 6 4.0
10: 3 1 2.0
11: 3 2 1.0
12: 3 3 1.5
13: 3 4 1.5
14: 3 5 1.5
15: 3 6 3.0
16: 4 4 2.0
17: 4 5 4.0
18: 4 6 3.0
最好的方法是什么?我想它应该是 shift 与其他操作的某种组合,但我无法把它们组合在一起。
答案 0 :(得分:1)
library(caTools) # for runmean()

# Row numbers satisfying the 1st condition: within each ID, the run of equal
# scores that starts on date == 1 with score == 5. Groups that do not match
# contribute nothing, so `torm` may be empty (or NULL when no group matches).
torm <- data[, if (score[1] == 5 && date[1] == 1) .I,
             by = .(ID, rleid(score), cumsum(date == 1))]$V1

# Select the rows to keep by positive index. Negative indexing with an empty
# index (`data[-torm]`) does not mean "keep everything", so it breaks when no
# row has to be removed; setdiff() handles an empty or NULL `torm` correctly.
keep <- setdiff(seq_len(nrow(data)), torm)

data[keep # remove the extra rows
  # add a trailing 2-point running mean of the score within each ID
  ][, mn := runmean(score, 2, endrule = "keep", align = "right"), by = ID
  # compute the new score: each group starts at a non-5 row followed by its
  # run of 5s, and the 5s take the running mean of that leading non-5 row.
  # If a group holds only 5s, which(...)[1] is NA and the new score is NA.
  ][, new.score := ifelse(score == 5, mn[which(score != 5)[1]], score),
    by = .(ID, cumsum(score != 5))][]
# ID date score mn new.score
# 1: 1 1 4 4.0 4.0
# 2: 1 2 3 3.5 3.0
# 3: 1 3 2 2.5 2.0
# 4: 1 4 2 2.0 2.0
# 5: 1 5 4 3.0 4.0
# 6: 1 6 1 2.5 1.0
# 7: 2 4 2 2.0 2.0
# 8: 2 5 1 1.5 1.0
# 9: 2 6 4 2.5 4.0
#10: 3 1 2 2.0 2.0
#11: 3 2 1 1.5 1.0
#12: 3 3 5 3.0 1.5
#13: 3 4 5 5.0 1.5
#14: 3 5 5 5.0 1.5
#15: 3 6 3 4.0 3.0
#16: 4 4 2 2.0 2.0
#17: 4 5 4 3.0 4.0
#18: 4 6 5 4.5 3.0
答案 1 :(得分:-1)
使用 zoo 包中的 na.locf:
library(zoo)

DF <- data.frame(
  ID = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4),
  date = c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6),
  score = c(4, 3, 2, 2, 4, 1, 5, 5, 5, 2, 1, 4, 2, 1, 5, 5, 5, 3, 5, 5, 5, 2, 4, 5)
)

# Carry the last non-NA value forward separately inside each group.
# na.rm = FALSE keeps leading NAs, so the result always has the same length
# as the input and can be assigned back to a data frame column.
locf_within <- function(values, groups) {
  ave(values, groups, FUN = function(v) zoo::na.locf(v, na.rm = FALSE))
}

# Mark rows for deletion: 1 = a score of 5 on date 1, 0 = any score other
# than 5. The remaining 5s inherit the mark of the row before them (same ID).
DF$markForDel <- NA_real_
DF$markForDel[DF$date == 1 & DF$score == 5] <- 1
DF$markForDel[DF$score != 5] <- 0
DF$markForDel <- locf_within(DF$markForDel, DF$ID)

# %in% never yields NA, so rows whose mark is still NA are kept instead of
# turning into all-NA rows.
newDF <- DF[!(DF$markForDel %in% 1), ]
rownames(newDF) <- NULL

# Impute the mean of the previous scores where score == 5.
newDF$score[newDF$score == 5] <- NA
row_idx <- seq_len(nrow(newDF))
newDF$imputedScore <- vapply(row_idx, function(i) {
  current <- newDF$score[i]
  if (!is.na(current)) {
    return(current)
  }
  # Only earlier rows of the same ID count as "previous scores".
  earlier <- newDF$score[row_idx < i & newDF$ID == newDF$ID[i]]
  if (length(earlier) == 0) {
    return(NA_real_)
  }
  # Mean of the last two (or the single) previous scores. Inside a run of 5s
  # this is NA and is filled from the first row of the run just below.
  mean(tail(earlier, 2))
}, numeric(1))
newDF$imputedScore <- locf_within(newDF$imputedScore, newDF$ID)
**输出:**
# Print the resulting data frame; the console output follows as comments.
newDF
# ID date score markForDel imputedScore
#1 1 1 4 0 4.0
#2 1 2 3 0 3.0
#3 1 3 2 0 2.0
#4 1 4 2 0 2.0
#5 1 5 4 0 4.0
#6 1 6 1 0 1.0
#7 2 4 2 0 2.0
#8 2 5 1 0 1.0
#9 2 6 4 0 4.0
#10 3 1 2 0 2.0
#11 3 2 1 0 1.0
#12 3 3 NA 0 1.5
#13 3 4 NA 0 1.5
#14 3 5 NA 0 1.5
#15 3 6 3 0 3.0
#16 4 4 2 0 2.0
#17 4 5 4 0 4.0
#18 4 6 NA 0 3.0