Question

以下是我的数据框df，它有3个变量和大约100k个数据点，

   ID   A   B   C
   1    35  0   0
   2    28  0   0
   3    36  0   0
   4    99  0   0 
   5    25  1   0
   6    65  1   0
   7    98  1   0
   8    95  1   0
   9    67  0   65
  10    95  0   65 
  11    94  1   65
  12    4   0   65
  13    2   -1  0
  14    62  -1  0
  15    95  -1  0
  16    25  -1  0
  17    36  0   19.5
  18    3   0   19.5
  20    68  1   19.5     
  21    17  0   0

现在，我想创建另一个变量D：取B == 1 or -1时A的最后一个数据点，减去下一个B == 1 or -1处C的值，所得的差即为D。

另一个条件是：如果列C中的值在接下来的4行内保持不变，那么程序不应该对A和C的值做减法。例如，B11的值变为1，但程序不应该为该数据点计算A11与C12之间的差，因为当B11 == 1时，C的值自上次改变以来仍在4行的计数之内。同样的规则也适用于B20处的数据点。

预期输出

B20

Answer 1

您应该可以执行以下操作：

条件1：

# Row indices where B == 1.
B_one <- which(x$B == 1)

# Start with D = 0 on every row.
x$D <- rep(0, nrow(x))

# For each row with B == 1, set D to that row's A minus the C value found at
# the *next* row where B == 1. The last such row has no successor, so its
# lookup index is NA and its D becomes NA.
# Indexing by seq_along(B_one) keeps next_one the same length as B_one, so the
# assignment is a no-op when no row has B == 1.
next_one <- c(B_one[-1], NA_integer_)[seq_along(B_one)]
x$D[B_one] <- x$A[B_one] - x$C[next_one]

条件2（我可能误解了这一点）：

# Position of each row inside its run of consecutive identical C values
# (1 for the first row of a run, 2 for the second, and so on).
c_runs <- rle(as.character(x$C))
counter <- sequence(c_runs$lengths)

# Rows sitting in the first four positions of a run get D reset to 0.
within_first_four <- counter <= 4
x[within_first_four, "D"] <- 0

Answer 2

这是我的解决方案。如果您愿意，可以把其中的几个步骤合并：

library(data.table)
# Example data from the question: 20 rows; ID, A and B are integer columns,
# C is a double column. Note that ID jumps from 18 to 20.
mydt <- data.table(
  ID = c(1:18, 20L, 21L),
  A = c(
    35L, 28L, 36L, 99L, 25L, 65L, 98L, 95L, 67L, 95L,
    94L, 4L, 2L, 62L, 95L, 25L, 36L, 3L, 68L, 17L
  ),
  B = c(
    0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L,
    1L, 0L, -1L, -1L, -1L, -1L, 0L, 0L, 1L, 0L
  ),
  C = rep(c(0, 65, 0, 19.5, 0), times = c(8L, 4L, 4L, 3L, 1L))
)

# Label runs of consecutive identical values in `x` with a running group id.
#
# x: a vector accepted by rle().
# Returns an integer vector the same length as `x`: every element of the first
# run is 1, every element of the second run is 2, and so on. Empty input gives
# an empty integer vector.
getGroups <- function(x) {
  grouplengths <- rle(x)$lengths
  # seq_along() instead of 1:length(): with zero runs, 1:0 is c(1, 0) and
  # rep() would fail because `times` has length 0.
  rep(seq_along(grouplengths), grouplengths)
}

# Keep only the last element of `x`, replacing every earlier position with 0.
#
# x: a vector.
# Returns a vector the same length as `x` whose final element is x[length(x)]
# and whose other elements are 0. Empty input gives an empty vector.
getLastValue <- function(x) {
  n <- length(x)
  # Clamp the pad length at 0 so empty input does not call rep() with a
  # negative `times`; x[0] is then simply an empty vector.
  c(rep(0, max(n - 1L, 0L)), x[n])
}

# Number each run of consecutive identical B values.
mydt[, Bgroup := getGroups(B)]

# Bpresent is TRUE on rows where B is 1 or -1 and FALSE on every other row.
mydt[, Bpresent := B %in% c(1, -1)]

# Within each flagged run, lastA is the A value of the run's final row.
mydt[Bpresent == TRUE, lastA := A[.N], by = Bgroup]

# firstC is the run's first C value when the run has fewer than 4 rows,
# otherwise 0.
mydt[
  Bpresent == TRUE,
  firstC := if (.N < 4L) C[1L] else 0,
  by = Bgroup
]

# Among flagged rows only: subtract the firstC of the following flagged row.
mydt[
  Bpresent == TRUE,
  difference := lastA - shift(firstC, n = 1L, type = "lead")
]

# Zero out rows with no result (unflagged rows, last flagged row) and rows
# where nothing was actually subtracted (difference equals lastA).
mydt[is.na(difference) | lastA == difference, difference := 0]

# Drop the helper columns and show the result.
mydt[, c("Bgroup", "Bpresent", "lastA", "firstC") := NULL]
mydt
# ID  A  B    C difference
# 1:  1 35  0  0.0        0.0
# 2:  2 28  0  0.0        0.0
# 3:  3 36  0  0.0        0.0
# 4:  4 99  0  0.0        0.0
# 5:  5 25  1  0.0        0.0
# 6:  6 65  1  0.0        0.0
# 7:  7 98  1  0.0        0.0
# 8:  8 95  1  0.0       30.0
# 9:  9 67  0 65.0        0.0
# 10: 10 95  0 65.0        0.0
# 11: 11 94  1 65.0        0.0
# 12: 12  4  0 65.0        0.0
# 13: 13  2 -1  0.0        0.0
# 14: 14 62 -1  0.0        0.0
# 15: 15 95 -1  0.0        0.0
# 16: 16 25 -1  0.0        5.5
# 17: 17 36  0 19.5        0.0
# 18: 18  3  0 19.5        0.0
# 19: 20 68  1 19.5        0.0
# 20: 21 17  0  0.0        0.0

使用条件

2 个答案: