tidyverse:按组进行汇总,同时针对每一行排除其所属的嵌套子组

时间:2018-07-09 03:58:04

标签: r dplyr

我想使用max和min来查看较大组中的哪些子组明显小于该特定较大组中的其余子组。

这样描述太啰嗦了,所以举个例子:

  # Sample data: nine observations in two groups (`grp`), each divided into
  # nested subgroups (`subgrp`), with one measured value (`num`) per row.
  # Every data row carries exactly three comma-separated fields so that it
  # lines up with the three-column header.
  x <- read.csv(text = 'grp,subgrp,num
              1,1,2
              1,1,3
              1,2,4
              1,2,6
              1,2,7
              2,3,7
              2,4,6
              2,4,7
              2,5,7')

在组1中,子组1明显小于子组2,因为 max(2,3) < min(4,6,7)。

我可以轻松地得到一个组的最大/最小值,以及一个子组的最大/最小值;但如果能像 dplyr::mutate 那样,为每一行得到“所在组中排除该行所属子组之后”的最小值,我就可以轻松地把子组的最大值与这个最小值进行比较,就像上面的示例那样。

我不知道的关键变量是mingrpexclsubgrp,整个结果看起来像:

  # Desired result: the sample data plus three derived columns.
  #   maxsubgrp        - largest `num` within the row's subgroup
  #   mingrpexclsubgrp - smallest `num` in the row's group once the row's own
  #                      subgroup has been left out
  #   isless           - whether maxsubgrp < mingrpexclsubgrp
  # NOTE: this assignment reuses the name `x`, so it replaces any earlier `x`.
  x <- read.csv(text = 'grp,subgrp,num,maxsubgrp,mingrpexclsubgrp,isless
                1,1,2,3,4,T
                1,1,3,3,4,T
                1,2,4,7,2,F
                1,2,6,7,2,F
                1,2,7,7,2,F
                2,3,7,7,6,F
                2,4,6,7,7,F
                2,4,7,7,7,F
                2,5,7,7,6,F')

maxsubgrp 可以这样得到:

# Attach the per-subgroup maximum of `num` to every row of that subgroup.
x %>%
  group_by(subgrp) %>%
  mutate(maxsubgrp = max(num))

而 isless 将是:

# Flag rows whose subgroup maximum lies below the minimum of the rest of
# their group (requires both columns to exist already).
x %>%
  mutate(isless = maxsubgrp < mingrpexclsubgrp)

base R 的解决方案可以基于恰当的合并(merge)来实现,但我希望有一个 tidyverse 风格和/或向量化的版本。

# Base R reference: for every (group, subgroup) pair, print the pair and then
# the smallest `num` found in the same group outside that subgroup.
for (g in unique(x$grp)) {
  in_group <- x$grp == g
  for (s in unique(x$subgrp[in_group])) {
    print(paste(g, s))
    print(min(x$num[in_group & x$subgrp != s]))
  }
}

4 个答案:

答案 0 :(得分:2)

下面是使用 data.table 的一种做法:

library(data.table)
# Convert `x` to a data.table by reference, then work one `grp` at a time.
# Within each group the nested `.SD[..., subgrp]` computes, per subgroup:
#   - the maximum of that subgroup's `num`, and
#   - the minimum of the group's `num` values belonging to other subgroups.
# The per-(grp, subgrp) summary is then joined back onto the rows of `x`, and
# `isless` is added by reference; the trailing `[]` makes the result print.
setDT(x)[, {          
      # Whole-group vectors, captured before the nested grouping by `subgrp`.
      sg <- .SD[['subgrp']]
      nm <- .SD[['num']]
        # In the nested call `subgrp` is the current subgroup's value, so
        # `sg != subgrp` picks the group's rows from every other subgroup.
        # setnames() renames result columns 2 and 3 (the two unnamed aggregates).
        setnames(.SD[, .(max(num), min(nm[sg != subgrp])), subgrp],
                  2:3, c('maxsubgrp', 'mingrpexclsubgrp'))
        }, by = grp
        ][x, on = .(grp, subgrp)
         ][ , isless := maxsubgrp < mingrpexclsubgrp][]
#  grp subgrp maxsubgrp mingrpexclsubgrp num isless
#1:   1      1         3                4   2   TRUE
#2:   1      1         3                4   3   TRUE
#3:   1      2         7                2   4  FALSE
#4:   1      2         7                2   6  FALSE
#5:   1      2         7                2   7  FALSE
#6:   2      3         7                6   7  FALSE
#7:   2      4         7                7   6  FALSE
#8:   2      4         7                7   7  FALSE
#9:   2      5         7                6   7  FALSE

或使用tidyverse

library(tidyverse)
# Handle each `grp` separately, then bind the per-group results together.
# The data is split on `grp` only; subgroups are dealt with inside each piece
# by group_by(subgrp).
x %>%
  split(.$grp, drop = TRUE) %>%
  map_df(~
    .x %>%
      group_by(subgrp) %>%
      mutate(
        maxsubgrp = max(num),
        # `.` is the whole per-group data frame piped into mutate(), while the
        # bare `subgrp` is the current subgroup, so this takes the minimum of
        # `num` over the group's rows that belong to other subgroups.
        mingrpexclsubgrp = min(.$num[!.$subgrp %in% subgrp]),
        isless = maxsubgrp < mingrpexclsubgrp
      ))
# A tibble: 9 x 6
# Groups:   subgrp [5]
#    grp subgrp   num maxsubgrp mingrpexclsubgrp isless
#  <int>  <int> <int>     <dbl>            <int> <lgl> 
#1     1      1     2         3                4 TRUE  
#2     1      1     3         3                4 TRUE  
#3     1      2     4         7                2 FALSE 
#4     1      2     6         7                2 FALSE 
#5     1      2     7         7                2 FALSE 
#6     2      3     7         7                6 FALSE 
#7     2      4     6         7                7 FALSE 
#8     2      4     7         7                7 FALSE 
#9     2      5     7         7                6 FALSE 

或者在创建 list 列之后使用 unnest

# Alternative: collapse each subgroup to one row that holds its `num` values
# in a list column, compute the exclusion minimum per group, then expand back
# to one row per observation.
x %>%
  group_by(grp, subgrp) %>% 
  # Add the subgroup maximum as an extra grouping column so that it is carried
  # through summarise().
  # NOTE(review): newer dplyr releases spell this argument `.add = TRUE`.
  group_by(maxsubgrp = max(num), add = TRUE) %>%
  # One row per (grp, subgrp); `num` becomes a list column.
  summarise(num = list(num))   %>% 
  group_by(grp) %>% 
  # For the i-th subgroup row of a group, drop the i-th list element and take
  # the minimum over the values of the remaining subgroups.
  mutate(mingrpexclsubgrp = map_int(row_number(), ~ 
                                         num[-.x] %>%    
                                         unlist %>% 
                                         min)) %>% 
  # Expand the list column back into individual rows.
  unnest %>%
  mutate(isless = maxsubgrp < mingrpexclsubgrp)
# A tibble: 9 x 6
# Groups:   grp [2]
#    grp subgrp maxsubgrp mingrpexclsubgrp   num isless
#  <int>  <int>     <dbl>            <int> <int> <lgl> 
#1     1      1         3                4     2 TRUE  
#2     1      1         3                4     3 TRUE  
#3     1      2         7                2     4 FALSE 
#4     1      2         7                2     6 FALSE 
#5     1      2         7                2     7 FALSE 
#6     2      3         7                6     7 FALSE 
#7     2      4         7                7     6 FALSE 
#8     2      4         7                7     7 FALSE 
#9     2      5         7                6     7 FALSE 

答案 1 :(得分:1)

下面是一种使用 setdiff 的略有不同的方法:

# For each row: the maximum of its subgroup, and the smallest `num` in the
# same group among rows that belong to a different subgroup.
# Reads from `x`, the sample data frame used throughout this page.
x %>%
    group_by(grp, subgrp) %>%
    mutate(maxsubgrp = max(num)) %>%
    group_by(grp) %>%
    mutate(
        mingrpexclsubgrp = map_dbl(subgrp, function(s) {
            # Exclude by row position, not by value: a value-based setdiff()
            # would also discard equal values that occur in other subgroups.
            other_rows <- setdiff(seq_along(num), which(subgrp == s))
            # Fallback for a group that has no other subgroup to compare with.
            if (length(other_rows) > 0) min(num[other_rows]) else min(maxsubgrp)
        }),
        isless = maxsubgrp < mingrpexclsubgrp)
## A tibble: 9 x 6
## Groups:   grp [2]
#    grp subgrp   num maxsubgrp mingrpexclsubgrp isless
#  <int>  <int> <int>     <dbl>            <dbl> <lgl>
#1     1      1     2        3.               4. TRUE
#2     1      1     3        3.               4. TRUE
#3     1      2     4        7.               2. FALSE
#4     1      2     6        7.               2. FALSE
#5     1      2     7        7.               2. FALSE
#6     2      3     7        7.               6. FALSE
#7     2      4     6        7.               7. FALSE
#8     2      4     7        7.               7. FALSE
#9     2      5     7        7.               6. FALSE

答案 2 :(得分:1)

//h3[contains(., 'Browse Publications') and contains(., 'filter')]

要获得结果作为您的表格,我们可以这样做:

// NOTE(review): this snippet is not R and appears unrelated to the dplyr question on this page; confirm it belongs here.
// XPath matching an <h3> whose text contains both 'Browse Publications' and 'filter'.
String category = "//h3[contains(., 'Browse Publications') and contains(., 'filter')]";
// From all elements matching the XPath, take the first one whose Displayed property is true.
WebElement settingSection = FindElements(By.XPath(category)).FirstOrDefault(x => x.Displayed);

答案 3 :(得分:0)

我弄明白了。正确的做法是按每一行进行 summarize,而不是 mutate。

# Compute the exclusion minimum row by row, then add the remaining columns.
x %>%
  # The key variable and generalizable answer: make every row its own group
  # (row_number() also copes with a zero-row input, unlike 1:n()).
  group_by(rownum = row_number(), grp, subgrp, num) %>%
  summarize(
    # `.` is the full data frame piped into summarize(); `grp` and `subgrp`
    # are the current row's values, so this is the minimum `num` in the same
    # group outside the row's own subgroup.
    mingrpexclsubgrp = min(.$num[.$grp == grp & .$subgrp != subgrp])
  ) %>%
  # The rest of the variables, computed per subgroup.
  group_by(subgrp) %>%
  mutate(
    maxsubgrp = max(num),
    isless = maxsubgrp < mingrpexclsubgrp
  )

我相信它可以进一步简化:

# Simplified version: a single grouped mutate() adds all three columns.
# NOTE(review): grouping by `subgrp` alone assumes each subgroup id occurs in
# only one `grp` - confirm for your data.
x %>%
  group_by(subgrp) %>%
  mutate(
    # `.` is the data frame piped into mutate(), so `.$num`, `.$grp` and `.$subgrp` span all rows rather than just the current subgroup; the group/subgroup match is done by explicit comparison instead
    mingrpexclsubgrp = min(.$num[.$grp == grp[1] & .$subgrp != subgrp[1]]), # [1] compares against a single value and avoids a length-mismatch warning
    maxsubgrp = max(num),
    isless = maxsubgrp < mingrpexclsubgrp
  )