如何在R中第一次出现另一个条件后根据条件的第二次出现从列中获取值?

时间:2014-07-03 13:46:07

标签: r

数据

以下是示例数据框:

> dput(df)
structure(list(Vehicle.ID = c(21L, 21L, 21L, 21L, 21L, 21L, 21L, 
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 
45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 
45L, 45L, 45L, 45L, 45L, 45L, 45L), gap.dist = c(36L, 37L, 38L, 
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 
52L, 53L, 54L, 55L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L), safept = c("no", 
"no", "no", "no", "dx_safe+CC2", "no", "no", "no", "no", "dx_safe", 
"no", "no", "no", "no", "no", "dx_safe+CC2", "no", "no", "dx_safe", 
"no", "no", "no", "no", "no", "dx_safe+CC2", "no", "no", "no", 
"no", "dx_safe", "no", "no", "no", "no", "no", "no", "no", "no", 
"dx_safe", "no")), .Names = c("Vehicle.ID", "gap.dist", "safept"
), row.names = c(NA, -40L), class = "data.frame")

目标

我想创建2列。第一列是safetylower,其值应为每个Vehicle.ID中safept第一次出现"dx_safe"时gap.dist列的值。第二列是safetyupper,其中应包含:

  • 在第一次出现"dx_safe"(其位置在上一步已找到)之后,第一次出现"dx_safe+CC2"时gap.dist的值。这适用于第一次出现"dx_safe"之后还有"dx_safe+CC2"出现的情况
  • 或者给定Vehicle.ID的gap.dist的最后一个值

因此,所需的输出如下:

期望输出

> dput(df)
structure(list(Vehicle.ID = c(21L, 21L, 21L, 21L, 21L, 21L, 21L, 
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 
45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 
45L, 45L, 45L, 45L, 45L, 45L, 45L), gap.dist = c(36L, 37L, 38L, 
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 
52L, 53L, 54L, 55L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L), safept = c("no", 
"no", "no", "no", "dx_safe+CC2", "no", "no", "no", "no", "dx_safe", 
"no", "no", "no", "no", "no", "dx_safe+CC2", "no", "no", "dx_safe", 
"no", "no", "no", "no", "no", "dx_safe+CC2", "no", "no", "no", 
"no", "dx_safe", "no", "no", "no", "no", "no", "no", "no", "no", 
"dx_safe", "no"), safetylower = c(45, 45, 45, 45, 45, 45, 45, 
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 
34), safetyupper = c(51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 44, 44, 44, 44, 44, 44, 
44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44)), .Names = c("Vehicle.ID", 
"gap.dist", "safept", "safetylower", "safetyupper"), row.names = c(NA, 
-40L), class = "data.frame")

我尝试了什么

我只能使用match创建第一列。下面是我尝试过但没有达到目标的代码,请帮忙。

library(plyr)
df <- ddply(df, 'Vehicle.ID', transform, safetylower = gap.dist[match('dx_safe', safept)], safetyupper = gap.dist[match('dx_safe+CC2', safept)])

修改

如果有多组dx_safe和dx_safe+CC2怎么办?请考虑以下数据框:

df <- data.frame(Vehicle.ID=rep(c(5,6),each= 50), gap.dist = rep(seq(from=10, to=59), 2), safept = rep(c(rep('no', 5), 'dx_safe+CC2', rep('no', 4), 'dx_safe', rep('no', 3), 'dx_safe+CC2', rep('no', 5), 'dx_safe', rep('no', 28), 'dx_safe+CC2'), 2))

基于两个答案中提供的相同代码(它们都完美无缺地工作),我如何只考虑较长的集合(中间行数最多的那一组),并按Vehicle.ID获取safetylower和safetyupper对应的gap.dist值?输出应为:

> dput(df)
structure(list(Vehicle.ID = c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
6, 6, 6, 6, 6), gap.dist = c(10L, 11L, 12L, 13L, 14L, 15L, 16L, 
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 
30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 
43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 
56L, 57L, 58L, 59L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 
45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 
58L, 59L), safept = structure(c(3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 
3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 2L, 
3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L), .Label = c("dx_safe", 
"dx_safe+CC2", "no"), class = "factor"), safetylower = c(30, 
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 
30, 30, 30), safetyupper = c(59, 59, 59, 59, 59, 59, 59, 59, 
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59)), .Names = c("Vehicle.ID", 
"gap.dist", "safept", "safetylower", "safetyupper"), row.names = c(NA, 
-100L), class = "data.frame")

2 个答案:

答案 0 :(得分:1)

使用split()进行分而治之的方法怎么样?

# Process each vehicle's rows separately, then put the pieces back together
# in the original row order with unsplit().
unsplit(lapply(split(df, df$Vehicle.ID), function(veh) {
    # Row of the first "dx_safe" for this vehicle (NA when there is none).
    safe.row <- which(veh$safept == "dx_safe")[1]
    # Rows flagged "dx_safe+CC2", keeping only those after safe.row.
    cc2.rows <- which(veh$safept == "dx_safe+CC2")
    cc2.row <- cc2.rows[which(cc2.rows > safe.row)][1]
    # No qualifying "dx_safe+CC2": fall back to the vehicle's last row.
    end.row <- if (is.na(cc2.row)) nrow(veh) else cc2.row
    cbind(veh,
          safetylower = veh$gap.dist[safe.row],
          safetyupper = veh$gap.dist[end.row])
}), df$Vehicle.ID)

这里我们基本上为每个"Vehicle.ID"创建一个data.frame。然后按照你的定义,为每个所需的"gap.dist"值找到相应的行索引。最后,我把这些值添加回data.frame,再用unsplit()把数据合并回去,以恢复原来的行顺序。

答案 1 :(得分:1)

我认为您使用match的想法是正确的,但您需要更多代码行。我会坚持使用ddply,因为你已经使用了它。

# Per vehicle: safetylower is gap.dist at the first "dx_safe"; safetyupper is
# gap.dist at the first "dx_safe+CC2" after it, or at the vehicle's last row
# when no such "dx_safe+CC2" exists.
ddply(df, .(Vehicle.ID), function(d) {
    i <- match("dx_safe", d$safept) # first match of "dx_safe"
    if (is.na(i)) # no "dx_safe" for this vehicle: both bounds are undefined
        return(transform(d, safetylower = NA, safetyupper = NA))
    # first match of "dx_safe+CC2" after first match of "dx_safe";
    # subsetting the row numbers avoids building (i+1):nrow(d), which runs
    # backwards when "dx_safe" sits on the last row
    j <- which(d$safept == "dx_safe+CC2")
    j <- j[j > i][1]
    if (is.na(j)) j <- nrow(d) # if no match, set equal to the last entry
    transform(d, safetylower = d$gap.dist[i],
        safetyupper = d$gap.dist[j])
})

请注意,如果"dx_safe"有可能在某个Vehicle.ID的d$safept中根本没有出现,那么这种情况可能会出问题,需要专门处理;但是根据您问题的措辞,我认为它总会出现。

此外,+1为结构良好的问题:)

修改: 如果你有很多"对""dx_safe"和"dx_safe+CC2",并希望比较每一对之间的"距离",然后选择最大的一个:

# Per vehicle: among all ("dx_safe", next "dx_safe+CC2") pairs, keep the one
# whose two rows are furthest apart and report gap.dist at both of its ends.
ddply(df, .(Vehicle.ID), function(d) {
    i <- which(d$safept == "dx_safe") # matches of "dx_safe"
    if (!length(i)) # if no matches of "dx_safe"
        return(transform(d, safetylower = NA, safetyupper = NA))
    j <- which(d$safept == "dx_safe+CC2") # matches of "dx_safe+CC2"
    j <- j[j > i[1]] # discard occurrences before first "dx_safe"
    if (!length(j)) { # no "dx_safe+CC2" after the first "dx_safe"
        lower.index <- i[1]
        upper.index <- nrow(d) # fall back to the vehicle's last row
    } else {
        # intervals[k] is the position in i of the last "dx_safe" before j[k]
        intervals <- findInterval(j, i)
        # Only the first "dx_safe+CC2" following a given "dx_safe" closes a
        # pair; later ones inside the same interval are not candidates.
        first <- !duplicated(intervals)
        pair.lower <- i[intervals[first]]
        pair.upper <- j[first]
        # Take both ends from the same candidate pair. Subscripting i with a
        # position in j picks an unrelated "dx_safe" (or NA when there are
        # more "dx_safe+CC2" rows than "dx_safe" rows).
        best <- which.max(pair.upper - pair.lower)
        lower.index <- pair.lower[best]
        upper.index <- pair.upper[best]
    }
    transform(d, safetylower = d$gap.dist[lower.index],
        safetyupper = d$gap.dist[upper.index])
})

在上面测试非重复间隔的原因是:如果一个"dx_safe"和一个"dx_safe+CC2"之间还有另一个"dx_safe+CC2",我们不希望这两者之间的距离被选为最大值。这样对吗?也就是说,如果你有一个向量c("dx_safe", "no", "no", "dx_safe+CC2", "no", "dx_safe+CC2"),则距离计算为3而不是5。请告诉我这是不是你的想法。请在使用前仔细测试,因为我没有数据,无法验证它在所有边缘情况下是否符合预期,但我认为它应该能涵盖这些情况。