Question

我编写了一些代码来计算一小部分鱼类的旅行时间。我每条鱼的“路径”的数据框d如下所示：

  TagID  Station  arrival           departure
1  2051   I80_1 2012-04-04 20:20:04 2012-04-04 20:35:04
2  2051  Lisbon 2012-04-05 09:06:18 2012-04-05 09:11:36
3  2051    Rstr 2012-04-05 18:46:34 2012-04-05 19:03:21
4  2051    Rstr 2012-04-05 22:31:59 2012-04-05 22:51:09
5  2051    Rstr 2012-04-06 02:30:31 2012-04-06 02:54:01
6  2051 Base_TD 2012-04-06 06:52:39 2012-04-06 08:24:11

我的代码提取每条鱼在最终站点的最终到达时间（例如，在上面的片段中，即 Base_TD 站的 2012-04-06 06:52:39）。

确定最终到达时间之后，我汇总出 ttime（每条鱼自 releasetime（一个预设值）起所经过的总时间）以及每条鱼的最终站点位置。我用 dplyr 通过下面的管道在整个数据集上完成了这一计算，但 dplyr 是我所知的完成此任务的唯一方法，我担心在反复分组和取消分组的过程中会引入不易察觉的错误。这种担心有道理吗？我该如何用 base R 编写等效的代码，以确认得到相同的结果？

# Release instant shared by every fish: entered as Pitcairn local time, then
# re-expressed in UTC (same instant, different display zone) so it can be
# compared directly with the UTC detection timestamps.
releasetime <- lubridate::with_tz(
  as.POSIXct("2012-03-30 18:00:00", tz = "Pacific/Pitcairn"),
  tzone = "UTC"
)

# Travel time per fish: elapsed time from `releasetime` to the first detection
# at whichever station the fish reached last.
tt <- d %>%
  group_by(TagID, Station) %>%
  arrange(arrival) %>%
  # Keep only the first detection of each fish at each station. slice() takes
  # row positions, so pass 1 rather than a logical vector.
  slice(1) %>%
  ungroup() %>%
  group_by(TagID) %>% # back up to the full-path level
  arrange(arrival) %>% # order each path by arrival time
  summarise(
    # Fix the units explicitly; plain `-` picks units per group, so different
    # fish could otherwise end up in different units.
    ttime = difftime(last(arrival), releasetime, units = "days"),
    # Rows are sorted by arrival, so the last row is the final station; this
    # always yields exactly one value, even if two arrivals tie.
    laststation = last(Station)
  )

如果您想使用示例数据集，下面是三条不同个体轨迹的 dput() 输出：

# Sample detection histories for three tagged fish (11, 10 and 13 rows).
# Timestamps are stored as seconds since the Unix epoch and tagged as UTC.
d <- data.frame(
  TagID = rep(c(2059L, 2062L, 2066L), times = c(11L, 10L, 13L)),
  Station = c(
    # fish 2059
    "I80_1", "Lisbon", "Rstr", "Rstr", "Base_TD", "BCE", "MAE", "MAW",
    "MAW", "MAE", "MAE",
    # fish 2062
    "I80_1", "Lisbon", "Rstr", "Base_TD", "BCE", "BCE", "BCE", "BCE",
    "BCE", "BCE",
    # fish 2066
    "I80_1", "Lisbon", "Rstr", "BCE", "BCE", "BCE", "MAE", "MAW", "MAW",
    "MAE", "MAE", "MAW", "MAE"
  ),
  arrival = .POSIXct(
    c(
      # fish 2059
      1333451872, 1333562100, 1333607351, 1333626207, 1333642897,
      1333725713, 1334092156, 1334092450, 1334102208, 1334102426,
      1334169836,
      # fish 2062
      1333232026, 1333301118, 1333364285, 1333383477, 1333729987,
      1333746859, 1333788503, 1333844040, 1333857104, 1333884034,
      # fish 2066
      1333184935, 1333229762, 1333270977, 1333503027, 1333533868,
      1333542226, 1333822681, 1333823087, 1333832661, 1333832863,
      1333861226, 1333861662, 1333877063
    ),
    tz = "UTC"
  ),
  departure = .POSIXct(
    c(
      # fish 2059
      1333452194, 1333562472, 1333608264, 1333626844, 1333643196,
      1333725773, 1334092599, 1334093077, 1334102905, 1334103169,
      1334169868,
      # fish 2062
      1333232307, 1333301776, 1333366712, 1333385467, 1333730036,
      1333746859, 1333788634, 1333844585, 1333857123, 1333884226,
      # fish 2066
      1333185124, 1333230300, 1333272832, 1333503224, 1333535705,
      1333542296, 1333823638, 1333824235, 1333832964, 1333833171,
      1333861898, 1333862298, 1333877179
    ),
    tz = "UTC"
  ),
  stringsAsFactors = FALSE
)

正确的输出应该是：

TagID ttime          laststation
 2059 10.801505 days         MAW
 2062  6.606331 days         BCE
 2066  7.683877 days         MAW

非常感谢你的帮助。

Answer 1

我们可以尝试使用 base R 中的 split：

# Step 1: first detection of every fish at every station.
# Sort once by arrival, then keep the first row of each TagID/Station pair.
by_arrival <- d[order(d$arrival), ]
r1 <- by_arrival[!duplicated(by_arrival[c("TagID", "Station")]), ]

# Step 2: r1 is still ordered by arrival, so the last row of each TagID is the
# station that fish reached last. Taking a row position (not matching on the
# timestamp) guarantees exactly one row per fish even when arrivals tie.
final <- r1[!duplicated(r1$TagID, fromLast = TRUE), ]
final <- final[order(final$TagID), ]

r2 <- data.frame(
  TagID = final$TagID,
  # Fix the units explicitly; plain `-` chooses them automatically, so they
  # could otherwise vary with the size of the time differences.
  ttime = difftime(final$arrival, releasetime, units = "days"),
  laststation = final$Station
)

row.names(r2) <- NULL
r2
#  TagID          ttime laststation
#1  2059 10.801505 days         MAW
#2  2062  6.606331 days         BCE  
#3  2066  7.683877 days         MAW

从R中的一系列分组变量中提取第一个值以计算行程时间

2 个答案: