在R中有条件地用字符串替换数据框中的NA

时间:2018-05-31 20:59:14

标签: r

我的数据框看起来像这样:

# Example data: eight rows with five Node_* columns (NA-padded on the right)
# and a `conversion` flag of 0 or 1.
# The values pass through matrix(), which holds a single type, so every
# column -- including `conversion` -- ends up as character ("0" / "1").
df <- as.data.frame(
  matrix(
    c(
      "True Organic", "True Organic", NA, NA, NA, 0,
      "True Organic", "True Organic", NA, NA, NA, 0,
      "Organic Search (SEO)", "Induced Organic", NA, NA, NA, 0,
      "Display", NA, NA, NA, NA, 0,
      "Social Ads (Act)", "Induced Organic", "Induced Organic", NA, NA, 1,
      "Referral", "Social Ads (Act)", NA, NA, NA, 0,
      "Special Emails", "Induced Organic", NA, NA, NA, 1,
      "Daily Email", "Daily Email", "Daily Email", NA, NA, 0
    ),
    nrow = 8, ncol = 6, byrow = TRUE,
    dimnames = list(
      NULL,
      c("Node_1", "Node_2", "Node_3", "Node_4", "Node_5", "conversion")
    )
  ),
  stringsAsFactors = FALSE
)


df

                Node_1           Node_2          Node_3 Node_4 Node_5 conversion
1         True Organic     True Organic            <NA>   <NA>   <NA>          0
2         True Organic     True Organic            <NA>   <NA>   <NA>          0
3 Organic Search (SEO)  Induced Organic            <NA>   <NA>   <NA>          0
4              Display             <NA>            <NA>   <NA>   <NA>          0
5     Social Ads (Act)  Induced Organic Induced Organic   <NA>   <NA>          1
6             Referral Social Ads (Act)            <NA>   <NA>   <NA>          0
7       Special Emails  Induced Organic            <NA>   <NA>   <NA>          1
8          Daily Email      Daily Email     Daily Email   <NA>   <NA>          0

对于每一行,如果该行的conversion列等于0,我想用字符串“Null”替换该行中所有的NA;如果conversion列等于1,则替换为“Conversion”。

我的最终输出应该如下所示。

# Expected result: the same eight rows with every NA replaced by "Null" where
# `conversion` is 0 and by "Conversion" where it is 1.
# As with the input frame, matrix() makes every column character.
df_desired <- as.data.frame(
  matrix(
    c(
      "True Organic", "True Organic", "Null", "Null", "Null", 0,
      "True Organic", "True Organic", "Null", "Null", "Null", 0,
      "Organic Search (SEO)", "Induced Organic", "Null", "Null", "Null", 0,
      "Display", "Null", "Null", "Null", "Null", 0,
      "Social Ads (Act)", "Induced Organic", "Induced Organic", "Conversion", "Conversion", 1,
      "Referral", "Social Ads (Act)", "Null", "Null", "Null", 0,
      "Special Emails", "Induced Organic", "Conversion", "Conversion", "Conversion", 1,
      "Daily Email", "Daily Email", "Daily Email", "Null", "Null", 0
    ),
    nrow = 8, ncol = 6, byrow = TRUE,
    dimnames = list(
      NULL,
      c("Node_1", "Node_2", "Node_3", "Node_4", "Node_5", "conversion")
    )
  ),
  stringsAsFactors = FALSE
)


df_desired


               Node_1           Node_2          Node_3     Node_4     Node_5 conversion
1         True Organic     True Organic            Null       Null       Null          0
2         True Organic     True Organic            Null       Null       Null          0
3 Organic Search (SEO)  Induced Organic            Null       Null       Null          0
4              Display             Null            Null       Null       Null          0
5     Social Ads (Act)  Induced Organic Induced Organic Conversion Conversion          1
6             Referral Social Ads (Act)            Null       Null       Null          0
7       Special Emails  Induced Organic      Conversion Conversion Conversion          1
8          Daily Email      Daily Email     Daily Email       Null       Null          0  

我可以使用嵌套的for循环来完成此操作。

  # Replace each NA cell of `df` with a label chosen by that row's
  # `conversion` flag: "Conversion" for "1", "Null" for "0". Rows whose flag is
  # anything else (including NA) are left untouched.
  for (i in seq_len(nrow(df))) {
    # The flag is constant for the whole row, so decide the label once.
    flag <- df$conversion[i]
    fill <- if (is.na(flag)) {
      NA_character_
    } else if (flag == "1") {
      "Conversion"
    } else if (flag == "0") {
      "Null"
    } else {
      NA_character_
    }
    if (is.na(fill)) {
      next
    }
    # Single pass over the columns; the original ran the "Null" pass once per
    # column because the second loop was nested inside the first.
    for (j in seq_len(ncol(df))) {
      if (is.na(df[i, j])) {
        df[i, j] <- fill
      }
    }
  }

不幸的是,这不能很好地扩展。必须有更好的方法来做到这一点。任何建议将不胜感激。提前致谢!

3 个答案:

答案 0 :(得分:3)

让我们先写一个处理单个向量的小函数(它接受两个向量作为输入):

# Fill the NA entries of one column using a parallel conversion flag.
#
# x:          vector of values that may contain NA.
# conversion: vector parallel to `x`; each element is compared against 1.
#
# Returns `x` with non-NA entries kept and each NA replaced by "Conversion"
# where `conversion == 1` and by "Null" where it is not. An NA in `conversion`
# leaves the corresponding NA in place.
foo <- function(x, conversion) {
  # Return the ifelse() result directly: ending the body with an assignment
  # would make the function return its value invisibly.
  ifelse(!is.na(x), x, ifelse(conversion == 1, "Conversion", "Null"))
}

然后用一个简单的循环来对除conversion以外的所有列进行操作:

# Fill the NAs in every column except the `conversion` flag column itself,
# one column at a time.
node_cols <- setdiff(names(df), "conversion")
for (node_col in node_cols) {
  df[[node_col]] <- foo(df[[node_col]], df$conversion)
}

df
#                 Node_1           Node_2          Node_3     Node_4     Node_5 conversion
# 1         True Organic     True Organic            Null       Null       Null          0
# 2         True Organic     True Organic            Null       Null       Null          0
# 3 Organic Search (SEO)  Induced Organic            Null       Null       Null          0
# 4              Display             Null            Null       Null       Null          0
# 5     Social Ads (Act)  Induced Organic Induced Organic Conversion Conversion          1
# 6             Referral Social Ads (Act)            Null       Null       Null          0
# 7       Special Emails  Induced Organic      Conversion Conversion Conversion          1
# 8          Daily Email      Daily Email     Daily Email       Null       Null          0

下面是一个稍作优化的版本。在100万行以上的数据上,它可能会节省几秒钟。

# Variant of the NA-filling helper that uses logical-index assignment
# instead of nested ifelse() calls.
#
# x:          vector of values that may contain NA.
# conversion: vector parallel to `x`; each element is compared against 1.
#
# Returns `x` with each NA replaced by "Conversion" where `conversion == 1`
# and by "Null" where it is not; non-NA entries are kept.
foo_x <- function(x, conversion) {
  # Both masks are computed before any assignment, so the two fills below
  # touch disjoint positions.
  missing_cells <- is.na(x)
  converted <- conversion == 1
  x[missing_cells & !converted] <- "Null"
  x[missing_cells & converted] <- "Conversion"
  x
}

答案 1 :(得分:2)

使用dplyr的另一种选择:

library(dplyr)
# For every column, replace NA with a label chosen by that row's `conversion`
# value; non-NA cells fall through to the `TRUE ~ .` branch unchanged.
# The label for converted rows is "Conversion" (capitalised) to match the
# required output.
# NOTE(review): funs() is deprecated in newer dplyr releases; the replacement
# there is mutate(across(everything(), ~ case_when(...))) -- confirm against
# the installed dplyr version before switching.
df <- df %>%
  mutate_all(funs(case_when(
    is.na(.) & conversion == 0 ~ "Null",
    is.na(.) & conversion == 1 ~ "Conversion",
    TRUE ~ .
  )))

答案 2 :(得分:1)

只用一次索引赋值即可完成:利用每个NA值所在的行号(row)对df$conversion列取子集:

# Logical matrix marking every NA cell of df.
na_cells <- is.na(df)
# One label per row: index 1 ("Null") for conversion 0, index 2 ("Conversion")
# for conversion 1.
row_labels <- c("Null", "Conversion")[as.numeric(df$conversion) + 1]
# row(df)[na_cells] gives the row number of each NA cell, which picks that
# row's label for the assignment.
df[na_cells] <- row_labels[row(df)[na_cells]]

#                Node_1           Node_2          Node_3     Node_4     Node_5 conversion
#1         True Organic     True Organic            Null       Null       Null          0
#2         True Organic     True Organic            Null       Null       Null          0
#3 Organic Search (SEO)  Induced Organic            Null       Null       Null          0
#4              Display             Null            Null       Null       Null          0
#5     Social Ads (Act)  Induced Organic Induced Organic Conversion Conversion          1
#6             Referral Social Ads (Act)            Null       Null       Null          0
#7       Special Emails  Induced Organic      Conversion Conversion Conversion          1
#8          Daily Email      Daily Email     Daily Email       Null       Null          0

执行速度应该很快。下面的示例在4.2秒内处理了150万行、115列的数据。

# Build a large test frame by resampling rows 1..8 of df 1.5e6 times and
# columns 1..5 115 times (both with replacement), keeping column 6 last.
# NOTE(review): assumes df is still the original 8 x 6 example frame with its
# NAs intact -- re-create it first if an earlier snippet already filled it.
df <- df[sample(1:8,1.5e6,replace=TRUE),c(sample(1:5,115,replace=TRUE),6)]
dim(df)
#[1] 1500000     116
# Time the single indexed assignment on the enlarged frame.
system.time({
  df[is.na(df)] <- c("Null","Conversion")[as.numeric(df$conversion)+1][row(df)[is.na(df)]]
})
#   user  system elapsed 
#   2.59    1.61    4.20