我有一个带有列距离和计数的数据帧,以及一个带有“T”的列,用于距离> 1的情况和距离<1的情况下的“F”。我想添加一个列,该列为距离> 1的所有行提供唯一的数字(从1开始),每次有一个单独的距离块> 1(在距离下降<1的时间段之间) 。例如,第一次距离> 1然后我期望的结果列中的值将是1,直到距离下降<1,而当返回“NA”时,则当距离再次增加> 1时,期望的值将是2等等!请参阅以下示例和所需的输出。
> dput(df)
structure(list(Count = c(45L, 76L, 23L, 2L, 67L, 34L, 103L, 34L,
88L, 45L, 76L, 23L, 11L, 56L, 22L, 23L, 45L, 66L, 7L, 8L, 2L,
34L, 31L, 45L, 77L, 2L, 34L, 11L, 10L, 2L, 45L, 67L, 56L, 67L,
44L, 54L, 23L, 34L, 22L, 12L, 34L, 22L, 1L, 12L, 12L), Distance = c(0.0066,
0.417, 0.894, 1.289, 1.7645, 2.177, 3.7475, 4.2306, 4.7326, 5.2144,
6.5687, 6.9534, 7.0745, 7.1011, 6.543, 5.0043, 5.0001, 4.7842,
3.22, 2.0045, 1.765, 0.2875, 1e-04, 0.0069, 0.0756, 0.7666, 1.0898,
2.2927, 2.9022, 3.8636, 4.0495, 3.318, 2.1792, 1.7837, 0.7964,
0.0062, 0.0066, 1.567, 2.8854, 3.5679, 2.5784, 1.1111, 0.7853,
0.2, 5e-04), OnTrip = c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)), .Names = c("Count",
"Distance", "OnTrip"), class = "data.frame", row.names = c(NA,
-45L))
期望的输出:
Count Distance OnTrip DesiredResult CumDist
1 45 0.0066 FALSE NA NA
2 76 0.4170 FALSE NA NA
3 23 0.8940 FALSE NA NA
4 2 1.2890 TRUE 1 1.2890
5 67 1.7645 TRUE 1 3.0535
6 34 2.1770 TRUE 1 4.8180
7 103 3.7475 TRUE 1 6.9950
8 34 4.2306 TRUE 1 10.7425
9 88 4.7326 TRUE 1 15.4751
10 45 5.2144 TRUE 1 20.2077
11 76 6.5687 TRUE 1 25.4221
12 23 6.9534 TRUE 1 31.9908
13 11 7.0745 TRUE 1 39.0653
14 56 7.1011 TRUE 1 46.1399
15 22 6.5430 TRUE 1 53.2410
16 23 5.0043 TRUE 1 59.7840
17 45 5.0001 TRUE 1 64.7841
18 66 4.7842 TRUE 1 69.7842
19 7 3.2200 TRUE 1 74.5684
20 8 2.0045 TRUE 1 77.7884
21 2 1.7650 TRUE 1 79.5534
22 34 0.2875 FALSE NA NA
23 31 0.0001 FALSE NA NA
24 45 0.0069 FALSE NA NA
25 77 0.0756 FALSE NA NA
26 2 0.7666 FALSE NA NA
27 34 1.0898 TRUE 2 1.0898
28 11 2.2927 TRUE 2 3.3825
29 10 2.9022 TRUE 2 5.6751
30 2 3.8636 TRUE 2 9.5387
31 45 4.0495 TRUE 2 13.4023
32 67 3.3180 TRUE 2 16.7203
33 56 2.1792 TRUE 2 20.0383
34 67 1.7837 TRUE 2 21.8221
35 44 0.7964 FALSE NA NA
36 54 0.0062 FALSE NA NA
37 23 0.0066 FALSE NA NA
38 34 1.5670 TRUE 3 1.5670
39 22 2.8854 TRUE 3 4.4524
40 12 3.5679 TRUE 3 7.3379
41 34 2.5784 TRUE 3 9.9163
42 22 1.1111 TRUE 3 12.4947
43 1 0.7853 FALSE NA NA
44 12 0.2000 FALSE NA NA
45 12 0.0005 FALSE NA NA
一旦我有了所需的结果列,我就想使用cumsum(df$DesiredResult)
函数为累积距离添加第二列,但是嵌套在距离> 1的块中(请参阅所需的输出)。
我真的很难为此编写规则,所以任何帮助都会非常感激。谢谢!
答案 0 :(得分:3)
如果您数据的前几行始终包含Distance < 1
,则可以rleid(OnTrip==TRUE)/2
使用data.table
生成id序列,通过该序列我们可以计算累积总和:
library(data.table)
setDT(df)[,DesiredResult:=rleid(OnTrip==TRUE)/2][
OnTrip==FALSE, DesiredResult:=NA][
OnTrip==TRUE, CumDist:=cumsum(Distance),by = "DesiredResult"]
> df
# Count Distance OnTrip desired DesiredResult CumDist
#1: 45 0.0066 FALSE 1 NA NA
#2: 76 0.4170 FALSE 2 NA NA
#3: 23 0.8940 FALSE 3 NA NA
#4: 2 1.2890 TRUE 4 1 1.2890
#5: 67 1.7645 TRUE 5 1 3.0535
#6: 34 2.1770 TRUE 6 1 5.2305
#7: 103 3.7475 TRUE 7 1 8.9780
#8: 34 4.2306 TRUE 8 1 13.2086
#9: 88 4.7326 TRUE 9 1 17.9412
#10: 45 5.2144 TRUE 10 1 23.1556
#11: 76 6.5687 TRUE 11 1 29.7243
#12: 23 6.9534 TRUE 12 1 36.6777
#13: 11 7.0745 TRUE 13 1 43.7522
#14: 56 7.1011 TRUE 14 1 50.8533
#15: 22 6.5430 TRUE 15 1 57.3963
#16: 23 5.0043 TRUE 16 1 62.4006
#17: 45 5.0001 TRUE 17 1 67.4007
#18: 66 4.7842 TRUE 18 1 72.1849
#19: 7 3.2200 TRUE 19 1 75.4049
#20: 8 2.0045 TRUE 20 1 77.4094
#21: 2 1.7650 TRUE 21 1 79.1744
#22: 34 0.2875 FALSE 22 NA NA
#23: 31 0.0001 FALSE 23 NA NA
#24: 45 0.0069 FALSE 24 NA NA
#25: 77 0.0756 FALSE 25 NA NA
#26: 2 0.7666 FALSE 26 NA NA
#27: 34 1.0898 TRUE 27 2 1.0898
#28: 11 2.2927 TRUE 28 2 3.3825
#29: 10 2.9022 TRUE 29 2 6.2847
#30: 2 3.8636 TRUE 30 2 10.1483
#31: 45 4.0495 TRUE 31 2 14.1978
#32: 67 3.3180 TRUE 32 2 17.5158
#33: 56 2.1792 TRUE 33 2 19.6950
#34: 67 1.7837 TRUE 34 2 21.4787
#35: 44 0.7964 FALSE 35 NA NA
#36: 54 0.0062 FALSE 36 NA NA
#37: 23 0.0066 FALSE 37 NA NA
#38: 34 1.5670 TRUE 38 3 1.5670
#39: 22 2.8854 TRUE 39 3 4.4524
#40: 12 3.5679 TRUE 40 3 8.0203
#41: 34 2.5784 TRUE 41 3 10.5987
#42: 22 1.1111 TRUE 42 3 11.7098
#43: 1 0.7853 FALSE 43 NA NA
#44: 12 0.2000 FALSE 44 NA NA
#45: 12 0.0005 FALSE 45 NA NA
答案 1 :(得分:1)
以下是基础R中的替代方案,它使用rle
和ifelse
来获取第一个变量,使用ave
和is.na<-
来构建第二个变量。
# get cumulative sum and NA combination by OnTrip
temp <- rle(df$OnTrip)
temp$values <- ifelse(temp$values, cumsum(temp$values), NA)
# add desired result variable
df$DesiredResult <- inverse.rle(temp)
# get cumulative distance
df$CumDist <- ave(df$Distance, df$DesiredResult, FUN=cumsum)
is.na(df$CumDist) <- is.na(df$DesiredResult)
返回
head(df)
Count Distance OnTrip DesiredResult CumDist
1 45 0.0066 FALSE NA NA
2 76 0.4170 FALSE NA NA
3 23 0.8940 FALSE NA NA
4 2 1.2890 TRUE 1 1.2890
5 67 1.7645 TRUE 1 3.0535
6 34 2.1770 TRUE 1 5.2305
请注意,最后两行可以折叠成一行,再次使用ifelse
df$CumDist <- ifelse(df$OnTrip, ave(df$Distance, df$DesiredResult, FUN=cumsum), NA)
我怀疑使用ifelse
和ave
对大型data.frames更有效,但ifelse
方法对于这些数据来说似乎更快一点。 (两个衬垫的中位数为165ms,ifelse
方法的中位数为153ms。