比较R中不同行的列

时间:2018-03-15 08:53:01

标签: r dplyr igraph

在使用dplyr的R中,我正在努力通过序列累积两列。

我想做什么:

在每个Outlet内我试图计算累计DFLSEcr(累计DFLSEcr = cumu_DFLSEcr)和countcumu_count)对于基于ZHYDNextDown序列的每一行。每行都有NextDown的值,该值对应于匹配ZHYD表示之前的行。这会产生DFLSEcrcount累积的序列。 Exutoire == 0然后cumu_DFLSEcr == 0cumu_count == 0的位置。如果DFLSEcr == 1 or NA则不包括在总和中。我使用过lag(),但我不认为这是正确的......

输入:

input <- structure(list(ZHYD = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
10L, 8L, 9L, 11L), .Label = c("B020006183", "B020006184", "B020006185", 
"B020006190", "B020006199", "B020006212", "B020006228", "B020006278", 
"B020006285", "B020006290", "B020006325"), class = "factor"), 
    Outlet = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
    2L, 3L), .Label = c("BSO0001727", "BSO0001746", "BSO0001756"
    ), class = "factor"), NextDown = structure(c(1L, 1L, 2L, 
    2L, 3L, 3L, NA, NA, 4L, 4L, 5L), .Label = c("B020006190", 
    "B020006199", "B020006228", "B020006290", "B020006335"), class = "factor"), 
    count = c(15L, 55L, 42L, 19L, 32L, 6L, 19L, 49L, 4L, 82L, 
    5L), DFLSEcr = c(0.07, 0.02, 0.02, 0.05, 0.03, 0.17, 0.05, 
    0.02, 0.25, 0.01, NA), Exutoire = c(0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, 
-11L), .Names = c("ZHYD", "Outlet", "NextDown", "count", "DFLSEcr", 
"Exutoire"))

到目前为止尝试的方法:

input %>% 
  group_by(Outlet) %>% 
  select(ZHYD, NextDown, ZHYD, Outlet, Exutoire, count, DFLSEcr) %>%
  mutate(cleanDFLSEcr = replace(DFLSEcr, DFLSEcr == 1, 0), 
         cleanDFLSEcr = replace(DFLSEcr, is.na(DFLSEcr), 0), 
         cumu_DFLSEcr = if_else(Exutoire == 1, 0, cumsum(cleanDFLSEcr)) %>% lag(1, default = 0), 
         cumu_count = if_else(Exutoire == 1, 0, as.numeric(cumsum(count))) %>% lag(1, default = 0)) %>%
  select(-cleanDFLSEcr)

期望的输出:

   ZHYD       Outlet     NextDown   count DFLSEcr Exutoire cumu_count cumu_DFLSEcr
   <fct>      <fct>      <fct>      <int>   <dbl>    <int>      <int>        <dbl>
 1 B020006183 BSO0001727 B020006190    15  0.0700        0         70       0.130 
 2 B020006184 BSO0001727 B020006190    55  0.0200        0         70       0.130 
 3 B020006185 BSO0001727 B020006199    42  0.0200        0         51       0.0800
 4 B020006190 BSO0001727 B020006199    19  0.0500        0         51       0.0800
 5 B020006199 BSO0001727 B020006228    32  0.0300        0         19       0.0500
 6 B020006212 BSO0001727 B020006228     6  0.170         0         19       0.0500
 7 B020006228 BSO0001727 <NA>          19  0.0500        1          0       0.    
 8 B020006290 BSO0001746 <NA>          49  0.0200        1          0       0.    
 9 B020006278 BSO0001746 B020006290     4  0.250         0          1       0.0200
10 B020006285 BSO0001746 B020006290    82  0.0100        0          1       0.0200
11 B020006325 BSO0001756 B020006335     5 NA             0          1       0.0200
在某些情况下,

cumu_countcumu_DFLSEcr输出相同,因为它们共享相同的NextDown

序列中的最后一行不应包含在cumu_count中。所以对于第一行cumu_count == 19 +32 + 19 = 70

修改

原来我需要igraph,因为这是一个路由问题。

1 个答案:

答案 0 :(得分:1)

好的,你的问题并不简单。您有嵌套数据,操作起来有点复杂。 我给你一个答案。肯定有更好的方法来做到这一点。但它可以给你一些想法。


library(dplyr)
library(tidyr)
father_son_table <-  select(input, actual = ZHYD, father = NextDown)

sequences <- rename(input, actual = ZHYD, father = NextDown) %>% 
  left_join(father_son_table, by = c("father" = "actual"), suffix= c(".1", ".2")) %>% 
  left_join(father_son_table, by = c("father.2" = "actual"), suffix = c(".1", ".3")) %>%  
  tibble::rowid_to_column(var = "sequence_number")



table_order <- sequences  %>% 
  select(-count, -DFLSEcr, -Exutoire, -Outlet) %>% 
  gather(key = height, value = node, -sequence_number) %>%  
  mutate(order = case_when( height == "actual" ~ 0,
                            height =="father.1" ~ 1,
                            height == "father.2" ~ 2,
                            height == "father.3" ~ 3 )) %>%  
  na.omit() %>% 
  select(sequence_number, node, order)

result <- left_join(table_order, input, by = c("node" = "ZHYD")) %>%  
  arrange(sequence_number, order) %>%  
  group_by(sequence_number, Outlet) %>% 
  mutate(cumu_count = sum(count) - count, 
        cumu_DFLSE_cr = sum(DFLSEcr)- DFLSEcr) %>% 
  filter(order == 0)