如何根据R中另一列中的值向数据帧添加序列号块

时间:2017-06-30 08:38:13

标签: r subset rule cumsum

我有一个数据帧,其中包含 Distance(距离)和 Count(计数)两列,以及一列 OnTrip:距离 > 1 时为“T”(TRUE),距离 < 1 时为“F”(FALSE)。我想添加一列,为所有距离 > 1 的行提供编号(从 1 开始):每出现一个新的、连续的距离 > 1 的块(块与块之间被距离 < 1 的时间段隔开),编号就加 1。例如,第一次距离 > 1 时,我期望结果列中的值为 1,直到距离降到 < 1 为止,此时该列应为“NA”;当距离再次升到 > 1 时,期望的值为 2,依此类推。请参阅以下示例和所需的输出。

> dput(df)
structure(list(Count = c(45L, 76L, 23L, 2L, 67L, 34L, 103L, 34L, 
88L, 45L, 76L, 23L, 11L, 56L, 22L, 23L, 45L, 66L, 7L, 8L, 2L, 
34L, 31L, 45L, 77L, 2L, 34L, 11L, 10L, 2L, 45L, 67L, 56L, 67L, 
44L, 54L, 23L, 34L, 22L, 12L, 34L, 22L, 1L, 12L, 12L), Distance = c(0.0066, 
0.417, 0.894, 1.289, 1.7645, 2.177, 3.7475, 4.2306, 4.7326, 5.2144, 
6.5687, 6.9534, 7.0745, 7.1011, 6.543, 5.0043, 5.0001, 4.7842, 
3.22, 2.0045, 1.765, 0.2875, 1e-04, 0.0069, 0.0756, 0.7666, 1.0898, 
2.2927, 2.9022, 3.8636, 4.0495, 3.318, 2.1792, 1.7837, 0.7964, 
0.0062, 0.0066, 1.567, 2.8854, 3.5679, 2.5784, 1.1111, 0.7853, 
0.2, 5e-04), OnTrip = c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, 
TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)), .Names = c("Count", 
"Distance", "OnTrip"), class = "data.frame", row.names = c(NA, 
-45L))

期望的输出:

   Count Distance OnTrip DesiredResult CumDist
1     45   0.0066  FALSE            NA      NA
2     76   0.4170  FALSE            NA      NA
3     23   0.8940  FALSE            NA      NA
4      2   1.2890   TRUE             1  1.2890
5     67   1.7645   TRUE             1  3.0535
6     34   2.1770   TRUE             1  4.8180
7    103   3.7475   TRUE             1  6.9950
8     34   4.2306   TRUE             1 10.7425
9     88   4.7326   TRUE             1 15.4751
10    45   5.2144   TRUE             1 20.2077
11    76   6.5687   TRUE             1 25.4221
12    23   6.9534   TRUE             1 31.9908
13    11   7.0745   TRUE             1 39.0653
14    56   7.1011   TRUE             1 46.1399
15    22   6.5430   TRUE             1 53.2410
16    23   5.0043   TRUE             1 59.7840
17    45   5.0001   TRUE             1 64.7841
18    66   4.7842   TRUE             1 69.7842
19     7   3.2200   TRUE             1 74.5684
20     8   2.0045   TRUE             1 77.7884
21     2   1.7650   TRUE             1 79.5534
22    34   0.2875  FALSE            NA      NA
23    31   0.0001  FALSE            NA      NA
24    45   0.0069  FALSE            NA      NA
25    77   0.0756  FALSE            NA      NA
26     2   0.7666  FALSE            NA      NA
27    34   1.0898   TRUE             2  1.0898
28    11   2.2927   TRUE             2  3.3825
29    10   2.9022   TRUE             2  5.6751
30     2   3.8636   TRUE             2  9.5387
31    45   4.0495   TRUE             2 13.4023
32    67   3.3180   TRUE             2 16.7203
33    56   2.1792   TRUE             2 20.0383
34    67   1.7837   TRUE             2 21.8221
35    44   0.7964  FALSE            NA      NA
36    54   0.0062  FALSE            NA      NA
37    23   0.0066  FALSE            NA      NA
38    34   1.5670   TRUE             3  1.5670
39    22   2.8854   TRUE             3  4.4524
40    12   3.5679   TRUE             3  7.3379
41    34   2.5784   TRUE             3  9.9163
42    22   1.1111   TRUE             3 12.4947
43     1   0.7853  FALSE            NA      NA
44    12   0.2000  FALSE            NA      NA
45    12   0.0005  FALSE            NA      NA

一旦我有了所需的结果列(DesiredResult),我就想使用 cumsum() 函数对 Distance 求累积和,为累积距离添加第二列(CumDist),但累积要分别在每个距离 > 1 的块内进行(请参阅所需的输出)。

我真的很难为此编写规则,所以任何帮助都会非常感激。谢谢!

2 个答案:

答案 0 :(得分:3)

如果您的数据开头几行始终满足 Distance < 1,则可以使用 data.table 的 rleid(OnTrip==TRUE)/2 生成 id 序列,然后按该 id 序列分组计算累积和:

library(data.table)
# Number each run of consecutive OnTrip == TRUE rows 1, 2, 3, ... and compute
# the running total of Distance inside each of those runs.
#
# rleid() numbers every run (TRUE and FALSE alike), so the TRUE runs fall on
# every second id and halving the id yields a trip counter. Plain
# rleid(OnTrip) / 2 is only correct when the data begins with a FALSE run;
# adding isTRUE(OnTrip[1L]) (1 when the first row is already on a trip,
# 0 otherwise) shifts the ids so the counter is right in both cases.
# Integer division keeps DesiredResult an integer column.
setDT(df)[, DesiredResult := (rleid(OnTrip) + isTRUE(OnTrip[1L])) %/% 2L][
  # Rows that are not on a trip get no block number.
  OnTrip == FALSE, DesiredResult := NA_integer_][
  # Only on-trip rows are assigned, so CumDist stays NA everywhere else.
  OnTrip == TRUE, CumDist := cumsum(Distance), by = "DesiredResult"]

> df
 #   Count Distance OnTrip desired DesiredResult CumDist
 #1:    45   0.0066  FALSE       1            NA      NA
 #2:    76   0.4170  FALSE       2            NA      NA
 #3:    23   0.8940  FALSE       3            NA      NA
 #4:     2   1.2890   TRUE       4             1  1.2890
 #5:    67   1.7645   TRUE       5             1  3.0535
 #6:    34   2.1770   TRUE       6             1  5.2305
 #7:   103   3.7475   TRUE       7             1  8.9780
 #8:    34   4.2306   TRUE       8             1 13.2086
 #9:    88   4.7326   TRUE       9             1 17.9412
#10:    45   5.2144   TRUE      10             1 23.1556
#11:    76   6.5687   TRUE      11             1 29.7243
#12:    23   6.9534   TRUE      12             1 36.6777
#13:    11   7.0745   TRUE      13             1 43.7522
#14:    56   7.1011   TRUE      14             1 50.8533
#15:    22   6.5430   TRUE      15             1 57.3963
#16:    23   5.0043   TRUE      16             1 62.4006
#17:    45   5.0001   TRUE      17             1 67.4007
#18:    66   4.7842   TRUE      18             1 72.1849
#19:     7   3.2200   TRUE      19             1 75.4049
#20:     8   2.0045   TRUE      20             1 77.4094
#21:     2   1.7650   TRUE      21             1 79.1744
#22:    34   0.2875  FALSE      22            NA      NA
#23:    31   0.0001  FALSE      23            NA      NA
#24:    45   0.0069  FALSE      24            NA      NA
#25:    77   0.0756  FALSE      25            NA      NA
#26:     2   0.7666  FALSE      26            NA      NA
#27:    34   1.0898   TRUE      27             2  1.0898
#28:    11   2.2927   TRUE      28             2  3.3825
#29:    10   2.9022   TRUE      29             2  6.2847
#30:     2   3.8636   TRUE      30             2 10.1483
#31:    45   4.0495   TRUE      31             2 14.1978
#32:    67   3.3180   TRUE      32             2 17.5158
#33:    56   2.1792   TRUE      33             2 19.6950
#34:    67   1.7837   TRUE      34             2 21.4787
#35:    44   0.7964  FALSE      35            NA      NA
#36:    54   0.0062  FALSE      36            NA      NA
#37:    23   0.0066  FALSE      37            NA      NA
#38:    34   1.5670   TRUE      38             3  1.5670
#39:    22   2.8854   TRUE      39             3  4.4524
#40:    12   3.5679   TRUE      40             3  8.0203
#41:    34   2.5784   TRUE      41             3 10.5987
#42:    22   1.1111   TRUE      42             3 11.7098
#43:     1   0.7853  FALSE      43            NA      NA
#44:    12   0.2000  FALSE      44            NA      NA
#45:    12   0.0005  FALSE      45            NA      NA

答案 1 :(得分:1)

以下是基础 R 中的替代方案:它使用 rle 和 ifelse 来获取第一个变量,使用 ave 和 is.na<- 来构建第二个变量。

# Run-length encode OnTrip, then relabel the runs: each TRUE run gets its
# ordinal among the TRUE runs (1, 2, 3, ...), each FALSE run gets NA
runs <- rle(df$OnTrip)
runs$values <- with(runs, ifelse(values, cumsum(values), NA))

# Expand the run labels back to one block number per row
df[["DesiredResult"]] <- inverse.rle(runs)

# Running total of Distance within each block number, then force NA on the
# rows that have no block number
df[["CumDist"]] <- with(df, ave(Distance, DesiredResult, FUN = cumsum))
is.na(df$CumDist) <- is.na(df$DesiredResult)

返回

head(df)
  Count Distance OnTrip DesiredResult CumDist
1    45   0.0066  FALSE            NA      NA
2    76   0.4170  FALSE            NA      NA
3    23   0.8940  FALSE            NA      NA
4     2   1.2890   TRUE             1  1.2890
5    67   1.7645   TRUE             1  3.0535
6    34   2.1770   TRUE             1  5.2305

请注意,最后两行可以合并成一行,同样使用 ifelse:

# Single-step variant: per-block running total of Distance on rows where
# OnTrip is TRUE, NA on all other rows
df[["CumDist"]] <- with(
  df,
  ifelse(OnTrip, ave(Distance, DesiredResult, FUN = cumsum), NA)
)

我怀疑使用 ifelse 和 ave 对大型 data.frame 更有效,但 ifelse 方法对于这些数据来说似乎更快一点(两行写法的中位数为 165ms,ifelse 方法的中位数为 153ms)。