如何将函数应用于dplyr group_by()中的整个df?

时间:2014-10-20 23:47:44

标签: r dplyr

我有这些功能:

foo <- function(z){

  bob <- which(z$signchg!=0)
  z$crit1 <- "opening"
  ifelse(length(bob)==0, z$crit1 <- "opening",
         ifelse(length(bob)==1,
                z$crit1[match(min(bob, na.rm=T), as.numeric(rownames(z)))] <- "opening",
                z$crit1[match(min(bob, na.rm=T), as.numeric(rownames(z))):match(max(bob[bob!=max(bob, na.rm=T)], na.rm=T), as.numeric(rownames(z)))] <- "unconscious follow"))
  z$crit1
}


foo2 <- function(y){
  bob <- which(y$signchg!=0)
  y$crit2 <- "opening"
  ifelse(length(bob)!=0,
  ifelse(length(y[y$crit1=="unconscious follow",]$sacc)==0,
         y$crit2[match(max(bob, na.rm=T), as.numeric(rownames(y))):nrow(y)] <- "opening",
         ifelse(length(head(which(y$sacc>max(y[y$crit1=="unconscious follow",]$sacc, na.rm=T)),1))==0, y$crit2 <- "opening",
         y$crit2[match(max(bob, na.rm=T), as.numeric(rownames(y))):head(which(y$sacc>max(y[y$crit1=="unconscious follow",]$sacc, na.rm=T)),1)] <- "unconscious follow")),
  y$crit2 <- "opening")
  y$crit2
}

foo3 <- function(x){
  bob <- which(x$signchg!=0)
  x$closing <- "opening"
  ifelse(length(bob)!=0, 
  x$closing[1:match(min(bob), as.numeric(rownames(x)))-1] <- "closing", x$closing <- "opening")
  x$closing
}

数据

以下是包含3个唯一Vehicle.ID(8,12和1179)的数据集。我拍了50行的样本:

> dput(ntraj1oo)
structure(list(Vehicle.ID = c(1179L, 12L, 12L, 1179L, 1179L, 
1179L, 8L, 1179L, 1179L, 1179L, 8L, 1179L, 12L, 1179L, 12L, 8L, 
1179L, 12L, 1179L, 1179L, 12L, 8L, 8L, 1179L, 1179L, 8L, 8L, 
12L, 1179L, 1179L, 12L, 1179L, 8L, 12L, 1179L, 1179L, 1179L, 
12L, 1179L, 12L, 1179L, 1179L, 12L, 12L, 8L, 1179L, 12L, 1179L, 
12L, 1179L), Frame.ID = c(3145L, 225L, 169L, 3549L, 3258L, 3262L, 
289L, 3246L, 3155L, 3316L, 74L, 3124L, 135L, 3398L, 434L, 342L, 
3288L, 93L, 3221L, 3384L, 293L, 347L, 452L, 3301L, 3165L, 448L, 
230L, 400L, 3343L, 3302L, 305L, 3242L, 333L, 181L, 3362L, 3201L, 
3356L, 150L, 3466L, 129L, 3123L, 3513L, 124L, 234L, 265L, 3440L, 
407L, 3497L, 454L, 3208L), sacc = c(1.2024142815693, 0.167471842386292, 
0.389526218261013, 1.0608535451082, 1.34658348989163, 1.30827746568167, 
0.676275947080881, 1.56168338812933, 1.45322442414619, 0.236926713182157, 
-0.331746789624733, -0.296457890957575, 0.578696068042145, -0.104188799716241, 
1.64373161583451, 0.74974701439042, 1.024635813019, -0.212898242245164, 
1.54066066716165, -0.439030115502196, -0.0908376863222584, 0.691762173865882, 
0.0956005839166526, 0.681722722129702, 1.44251516088868, -0.0772419385643099, 
0.430003386843667, 1.05958689269776, -0.402975701449174, 0.648704793894625, 
-0.0106984134869645, 1.63176231974786, 0.884756294567357, 0.219219760305613, 
-0.428935665947576, 1.54207226189423, -0.40185390261026, 0.441773747246007, 
0.983291264446801, 0.596528992338635, -0.351283490561794, 1.11356697363866, 
0.64253447660771, 0.0491453453593057, 0.715465534653409, 0.760489329987362, 
1.17711496285387, 1.07374138870048, 1.45061613430159, 1.5589484008358
), relative.v = c(-7.20683108836496, 1.41754770518283, -0.298659684886637, 
-6.37538134834612, -4.00321428084874, -3.82309181190075, -0.727408127343359, 
-4.14013093963352, -6.7253476528766, 4.84058965232001, -2.51365849828336, 
-4.82796782714515, -2.2317642496626, -1.54138020745749, -2.91023536393949, 
-0.904299522098896, -0.549568281350204, -2.99526240263305, -6.18033016152812, 
1.08350055196426, 2.52903114154146, -1.01292990996659, -2.54795991136474, 
2.14686490991681, -7.03361953812604, -1.24128349787506, -0.149590211893916, 
-4.29601660568767, 4.70617725169663, 2.47874406770293, -0.442134244952982, 
-4.72366659693532, -1.10949949758366, 0.850218831661735, 2.42271763669292, 
-8.2259447855115, 1.44195914620509, -1.88517424984066, -6.48099656406857, 
-3.22006152601574, -4.53955604248154, -7.95149284172251, -3.95841822705948, 
0.978824881565963, -0.832249768583615, -3.99216317969555, -4.56499371815966, 
-5.89675705778252, -0.269620247442631, -7.75907851102451), nspacing = c(67.9564390167725, 
64.4222965548587, 69.9984793222568, 203.630967606615, 142.825962756316, 
144.4974871287, 69.5663930132816, 138.544960496636, 75.1355363890009, 
145.313025161387, 62.76071823522, 52.3376957871262, 63.854711706948, 
119.303164791766, 82.7183786313178, 78.0100285715123, 151.786017600382, 
41.6146093571944, 124.898333310041, 118.810008693412, 57.9329927929634, 
78.1975432716604, 97.9377561743831, 151.845647043811, 81.0478415333349, 
97.4581470183944, 63.9970348761168, 67.6721711092462, 129.125820950528, 
151.636781319948, 56.1796449012404, 136.907951327661, 77.12358891961, 
68.5284958380145, 126.438422026932, 109.685235806325, 126.52282899785, 
65.3271870401025, 148.692268232249, 62.3990368362372, 51.846063554017, 
178.498350166457, 60.768801672643, 62.2994121863875, 69.1176002124943, 
135.401524339836, 71.0466952274176, 167.365284062391, 85.027302124975, 
115.693182668085)), class = c("tbl_df", "tbl", "data.frame"), .Names = c("Vehicle.ID", 
"Frame.ID", "sacc", "relative.v", "nspacing"), row.names = c(901L, 
606L, 550L, 1305L, 1014L, 1018L, 261L, 1002L, 911L, 1072L, 46L, 
880L, 516L, 1154L, 815L, 314L, 1044L, 474L, 977L, 1140L, 674L, 
319L, 424L, 1057L, 921L, 420L, 202L, 781L, 1099L, 1058L, 686L, 
998L, 305L, 562L, 1118L, 957L, 1112L, 531L, 1222L, 510L, 879L, 
1269L, 505L, 615L, 237L, 1196L, 788L, 1253L, 835L, 964L))

应用函数会产生错误

现在,对此数据应用函数会产生错误:

ovv <- ntraj1oo %>% 

  group_by(Vehicle.ID) %>% 

  mutate(signs = sign(relative.v), 
         signchg = c(NA, diff(signs))) %>%

  do(data.frame(Frame.ID=.$Frame.ID,crit1=foo(.), crit2=foo2(.), closing=foo3(.))) %>%

  inner_join(x=ntraj1oo, y=., by=c("Vehicle.ID", "Frame.ID")) %>%

  mutate(behavior = ifelse(crit1=="unconscious follow" |crit2=="unconscious follow", "Unconscious Following",
                           ifelse(closing=="closing" & relative.v>0, "closing", 
                                  ifelse(closing=="closing" & relative.v<0, "Unconscious Following", "opening")))) %>%
  ungroup()

错误

Error in match(max(bob, na.rm = T), as.numeric(rownames(y))):nrow(y) : 
  NA/NaN argument

但仅使用1辆车的数据不会产生错误。我分别测试了车辆8,12和1179,没有错误 这些只有3辆车,共50排。如果我在具有944 Vehicle.IDs的原始数据集上应用这些函数,我会收到以下错误:

Error in match(min(bob, na.rm = T), as.numeric(rownames(z))):match(max(bob[bob !=  : 
  NA/NaN argument

同样,使用1辆车的完整数据不会产生任何错误。当Vehicle.ID超过1时,为什么dplyr不应用函数?

1 个答案:

答案 0 :(得分:0)

group_by()失败时,我们恢复了老式的行索引和for循环:

# Create new cols, with pessimism
ovv$crit1 <- NA
ovv$crit2 <- NA
ovv$closing <- NA
ovv$behavior <- NA

for (vid in uniq(ovv$Vehicle.ID)) {
   ovv $newcol
   # Form a row-index
   I <- which(ovv$Vehicle.ID == vid)

   # Apply your fns, vid-wise...
   ovv[I,]$crit1 <- foo(ovv[I,])
   ovv[I,]$crit2 <- foo2(ovv[I,])
   ovv[I,]$closing <- foo3(ovv[I,])
}

ovv$behavior <- ifelse(...)