Question

我有一个这样的数据框：

     Q1  Q1a    Q2    Q2a   Q2b
1   foo <NA>    fee   <NA>  <NA>
2   bar <NA>  other    ree  <NA>
3 other  roo    bee   <NA>  <NA>
4   bar <NA>    fee   <NA>  <NA>
5   bar <NA>  other    fee  <NA>
6 other  fee  other    <NA>  roo

我想把所有的 'other' 替换为同一行中后续列（即 Qxa、Qxb 等列）的值，这样就可以删除那些稀疏的列：

     Q1    Q2  
1   foo   fee  
2   bar   ree
3   roo   bee 
4   bar   fee 
5   bar   fee
6   fee   roo

对于单独的一列，我可以想到这样做：

# Raw, whitespace-delimited survey answers. "other" in a base column (Q1, Q2)
# means the actual answer sits in one of its follow-up columns (Q1a, Q2a, Q2b).
Lines <- "
Q1  Q1a    Q2    Q2a   Q2b
foo NA fee  NA  NA
bar NA other  ree  NA
other roo  bee   NA  NA
bar NA    fee   NA  NA
bar NA    other fee  NA
other  fee  other NA  roo"

# Parse the text block; as.is = TRUE keeps strings as character vectors.
df <- read.table(text = Lines, header = TRUE, as.is = TRUE)

# Single-column version of the fix: wherever Q1 is "other", copy the value
# from Q1a in the same row.
other_rows <- which(df$Q1 == "other")
df$Q1[other_rows] <- df$Q1a[other_rows]

然后对每一列重复这一操作，但这样做既繁琐又缓慢，因为有很多列，而且每一列中都有多个需要替换的值。

有更简洁的方法吗？（另外，我的方法不会整齐地延伸到Q2a Q2b示例。）

Answer 1

我们可以根据相似的列名对数据集进行 `split`，然后循环遍历由 data.frame 组成的 `list`，并根据使用 `max.col` 创建的索引替换 'other' 的值：

# Collapse every question group (Q1 + Q1a, Q2 + Q2a + Q2b, ...) into a single
# column. Columns are grouped by stripping the trailing lowercase suffix from
# their names; within a group, each "other" in the base column is replaced by
# the first usable follow-up value found in the same row.
data.frame(lapply(split.default(df, sub("[a-z]+$", "", names(df))), function(x) {
  # A group without follow-up columns has nothing to substitute from.
  if (ncol(x) < 2L) {
    return(x[[1]])
  }
  # Rows whose base answer is "other". %in% maps NA to FALSE, so missing base
  # answers are left untouched instead of breaking the assignment below.
  i1 <- x[[1]] %in% "other"
  # Logical matrix marking follow-up cells that hold a real value.
  i2 <- x[-1] != "other" & !is.na(x[-1])
  # Column of the first usable follow-up per row. ties.method = "first" keeps
  # the choice deterministic (the default breaks ties at random). Multiplying
  # by i1 zeroes the column index for rows that need no replacement, and rows
  # of an index matrix that contain a zero are ignored.
  pick <- cbind(seq_len(nrow(i2)), max.col(i2, ties.method = "first") * i1)
  x[[1]][i1] <- as.matrix(x[-1])[pick]
  x[[1]]
}))
#   Q1  Q2
#1 foo fee
#2 bar ree
#3 roo bee
#4 bar fee
#5 bar fee
#6 fee roo

或tidyverse

library(dplyr)
library(purrr)
# Same grouping as the base-R version: split columns by question stem, then
# reduce each group to one column.
split.default(df, sub("[a-z]+$", "", names(df))) %>%
  map_df(function(grp) {
    # Treat "other" as missing so it never wins over a real follow-up value.
    cleaned <- replace(grp, grp == "other", NA)
    # Take the first non-missing value per row, left to right. This avoids
    # pasting the row together and regex-stripping "NA"/whitespace, which
    # would also mangle real answers containing those characters.
    do.call(coalesce, unname(as.list(cleaned)))
  })
# A tibble: 6 x 2
#     Q1    Q2
#   <chr> <chr>
#1   foo   fee
#2   bar   ree
#3   roo   bee
#4   bar   fee
#5   bar   fee
#6   fee   roo

Answer 2

这种做法无法很好地扩展到类似 Q3a–Q3f 这样的情况，但一个快速而粗糙的解决方案是……

library(tidyverse)

# Quick, hand-written version: one ifelse() per question group. It does not
# scale to many groups, but is easy to read for two.
new_df <- df %>%
    mutate(
        # Q1 has a single follow-up column.
        Q1 = ifelse(Q1 == 'other', Q1a, Q1),
        # Q2 has two follow-ups: prefer Q2a and fall back to Q2b. Rows that
        # are not 'other' keep their original Q2 value (ifelse() has no
        # default for its `no` argument, so it must be supplied).
        Q2 = ifelse(Q2 == 'other',
                    ifelse(!is.na(Q2a), Q2a, Q2b),
                    Q2)) %>%
    select(Q1, Q2)

在 R 中跨多列用后续列的值进行替换

2 个答案: