如何根据条件提取列名?

时间:2018-09-12 15:52:05

标签: r dplyr purrr

考虑这个简单的例子

# Example data: two groups of three rows. Each value column (x, other_var, y,
# another_var) is populated in only one of the groups, and every value column
# has a companion label_<name> column holding the name to report for it.
# Built with tibble(); data_frame() is the deprecated spelling of the same call.
mytest <- tibble(
  group = c("a", "a", "a", "b", "b", "b"),
  x = c(NA, NA, NA, 5, 6, 7),
  other_var = c(NA, NA, NA, 1, 2, 3),
  y = c(3, 5, 6, NA, NA, NA),
  another_var = c(1, 2, 3, NA, NA, NA),
  label_x = c("hello", "hello", "hello", "world", "world", "world"),
  label_y = c("bada", "bada", "bada", "boom", "boom", "boom"),
  label_other_var = c("ak", "ak", "ak", "run", "run", "run"),
  label_another_var = c("noo", "noo", "noo", "bie", "bie", "bie")
)

# A tibble: 6 x 9
  group     x other_var     y another_var label_x label_y label_other_var label_another_var
  <chr> <dbl>     <dbl> <dbl>       <dbl> <chr>   <chr>   <chr>           <chr>            
1 a        NA        NA     3           1 hello   bada    ak              noo              
2 a        NA        NA     5           2 hello   bada    ak              noo              
3 a        NA        NA     6           3 hello   bada    ak              noo              
4 b         5         1    NA          NA world   boom    run             bie              
5 b         6         2    NA          NA world   boom    run             bie              
6 b         7         3    NA          NA world   boom    run             bie 

在这里,我需要按 group 进行 nest(),并且能够提取(每个嵌套数据框中)不是 NA 的变量的列名。诀窍是,变量的实际名称显示在对应的 label_ 变量中。

例如,这是所需的输出:

# A tibble: 4 x 2
  group var  
  <chr> <chr>
1 a     bada 
2 a     noo  
3 b     world
4 b     run  

确实,以 a 组为例。只有 y 和 another_var 是非缺失变量。但是,y 的名称为 bada(如 label_y 变量所示),another_var 的名称为 noo。对于 b 组的推理也是如此。

运行以下代码后,我不知道如何编写相应的 map 调用:

mytest %>% group_by(group) %>% nest()
# A tibble: 2 x 2
  group data
  <chr> <list>
1 a     <tibble [3 x 8]>
2 b     <tibble [3 x 8]>

 # Smaller example with only two value columns (x, y) and their label columns.
 # The "+" console continuation prompts were removed so the snippet can be
 # pasted and run as-is; tibble() replaces the deprecated data_frame().
 mytest <- tibble(
   group = c("a", "a", "a", "b", "b", "b"),
   x = c(NA, NA, NA, 5, 6, 7),
   y = c(3, 5, 6, NA, NA, NA),
   label_x = c("hello", "hello", "hello", "world", "world", "world"),
   label_y = c("bada", "bada", "bada", "boom", "boom", "boom")
 )

有什么想法吗? 谢谢!

编辑:最初的,较小的,有问题的提议如下

/**
 * Builds a text dump of the call log.
 *
 * Every call becomes one record of the form
 * {@code direction||number||duration||date;} followed by a newline, where
 * direction is OUTGOING, INCOMING or MISSED. Any other call type leaves the
 * direction unset, so it is rendered as the literal text "null".
 *
 * @return the concatenated records; an empty string when the query returns
 *         no cursor or no rows
 */
private String getCallDetails()
{
    StringBuilder sb = new StringBuilder();
    // Query through the ContentResolver instead of managedQuery(): this method
    // closes the cursor itself, and a managed cursor must not be closed by the
    // caller because the Activity closes/requeries it on its own.
    Cursor cursor = getContentResolver().query(CallLog.Calls.CONTENT_URI,
            null, null, null, null);
    if (cursor == null) {
        // The provider returned nothing to iterate over.
        return sb.toString();
    }
    try {
        int number = cursor.getColumnIndex(CallLog.Calls.NUMBER);
        int type = cursor.getColumnIndex(CallLog.Calls.TYPE);
        int date = cursor.getColumnIndex(CallLog.Calls.DATE);
        int duration = cursor.getColumnIndex(CallLog.Calls.DURATION);
        while (cursor.moveToNext()) {
            String phNumber = cursor.getString(number);
            String callType = cursor.getString(type);
            String callDate = cursor.getString(date);
            // The date column is read as text and parsed as epoch milliseconds.
            Date callDayTime = new Date(Long.parseLong(callDate));
            String callDuration = cursor.getString(duration);
            String callDirection = null;
            int callDirectionCode = Integer.parseInt(callType);
            switch (callDirectionCode)
            {
                case CallLog.Calls.OUTGOING_TYPE:
                    callDirection = "OUTGOING";
                    break;
                case CallLog.Calls.INCOMING_TYPE:
                    callDirection = "INCOMING";
                    break;
                case CallLog.Calls.MISSED_TYPE:
                    callDirection = "MISSED";
                    break;
            }
            sb.append(callDirection).append("||")
              .append(phNumber).append("||")
              .append(callDuration).append("||")
              .append(callDayTime).append(";\n");
        }
    } finally {
        // Release the cursor even when parsing a row throws.
        cursor.close();
    }
    return sb.toString();
}

2 个答案:

答案 0 :(得分:3)

按组 nest 之后,用 map 遍历每个嵌套数据框,用 summarise 提取各 'label' 列中对应值非 NA 的第一个元素,用 gather 合并到单个列,同时移除 NA(na.rm = TRUE),select 'var' 列,然后(在只保留感兴趣的列之后)执行 unnest

# For every group, report the label of each value column (x, y) that holds at
# least one non-missing value in that group.
mytest %>%
  group_by(group) %>%
  nest() %>%
  mutate(var = map(data, function(grp) {
    grp %>%
      # label[!is.na(value)][1] is the label on the first row where the value
      # is present, or NA when the value column is entirely missing here.
      summarise(label_x = label_x[!is.na(x)][1],
                label_y = label_y[!is.na(y)][1]) %>%
      # Stack the labels into a single `var` column and drop the NA ones.
      gather(key, var, na.rm = TRUE) %>%
      select(var)
  })) %>%
  select(-data) %>%
  # Name the list-column explicitly; a bare unnest() relies on the
  # "unnest every list-column" default, which newer tidyr warns about.
  unnest(var)
# A tibble: 2 x 2#
#  group var 
#  <chr> <chr>
#1 a     bada 
#2 b     world

更新

如果有更多列,请创建唯一的列名称,然后使用map2遍历相应的列名称

# Value column names: take every column name except "group", remove the first
# occurrence of "label_" from it, and de-duplicate (so "x" and "label_x" both
# collapse to "x").
nm1 <- unique(sub("label_", "", setdiff(names(mytest), "group")))
# Matching label column names, in the same order as nm1.
nm2 <- paste0("label_", nm1)
# Nest by group, then for each nested data frame collect the labels of the
# value columns that contain at least one non-NA entry.
mytest %>% 
   group_by(group) %>% 
   nest %>%
   # Outer lambda: .x is one group's nested data frame.
   mutate(var = map(data, ~ 
                    # Walk the value columns and the label columns in parallel.
                    map2_chr(.x %>% 
                               select(nm1),
                             .x %>%
                              select(nm2), ~ 
                                # Inner lambda: .x is a value column and .y its
                                # label column; yields the label on the first
                                # row where the value is non-NA, or NA if none.
                                .y[!is.na(.x)][1]) %>% 
                                   # Drop the NA labels, then wrap the rest in
                                   # a one-column tibble named var.
                                   na.omit %>%
                                   tibble(var = .))) %>% 
    # Keep only group and the list-column var, then expand var into rows.
    select(-data) %>%
    unnest
# A tibble: 4 x 2
#  group var  
#  <chr> <chr>
#1 a     bada 
#2 a     noo  
#3 b     world
#4 b     run  

答案 1 :(得分:1)

这将输出您想要的结果:

# Example data: value columns x and y are each populated in only one group;
# label_x / label_y hold the name to report for the corresponding column.
# Built with tibble(); data_frame() is the deprecated spelling of the same call.
mytest <- tibble(
  group = c("a", "a", "a", "b", "b", "b"),
  x = c(NA, NA, NA, 5, 6, 7),
  y = c(3, 5, 6, NA, NA, NA),
  label_x = c("hello", "hello", "hello", "world", "world", "world"),
  label_y = c("bada", "bada", "bada", "boom", "boom", "boom")
)

#' Pick the label of the populated value column for one group
#'
#' Filters `df` to the rows whose `group` equals `subgroup`, decides which of
#' the value columns `x` / `y` carries data in those rows, and returns the
#' matching `label_x` / `label_y` entry from the first of those rows.
#'
#' `label_y` is chosen when `x` has missing values and `y` has data; otherwise
#' `label_x` is chosen when `x` has data.
#'
#' @param df Data frame with columns `group`, `x`, `y`, `label_x`, `label_y`.
#' @param subgroup The single group value to look up.
#' @return A one-row tibble with columns `group` and `var`. `var` is
#'   `NA_character_` when the group has no rows or when neither `x` nor `y`
#'   holds a non-missing value.
extract_good_colnames <- function(df, subgroup) {
  rows <- filter(df, group == subgroup)
  x_has_data <- any(!is.na(rows$x))
  y_has_data <- any(!is.na(rows$y))

  if (anyNA(rows$x) && y_has_data) {
    colname <- "label_y"
  } else if (x_has_data) {
    colname <- "label_x"
  } else {
    # Nothing to report: without this branch `colname` would be undefined
    # (empty group) or would point at an all-missing column.
    return(tibble(group = subgroup, var = NA_character_))
  }

  # [[ ]] extracts the column as a vector, so as.character() sees the value
  # itself rather than a 1x1 data frame.
  tibble(group = subgroup, var = as.character(rows[[colname]][1]))
}

# Distinct group values, in order of first appearance.
groups <- unique(mytest$group)
# Look up each group in turn and row-bind the one-row results; `df` is passed
# by name so every element of `groups` is supplied as `subgroup`.
map_df(groups, extract_good_colnames, df = mytest)