根据表中最接近的列分组的平均值替换NA的值

时间:2017-01-21 08:00:15

标签: r dplyr tidyr

我的目标是用最接近的分组的平均值替换 NA。例如,假设有四个变量 A、B、C 和 Num,其中 Num 是数值变量,而 A、B、C 是分类变量。现在,如果在 A = Alpha、B = Beta、C = Theta 的某一行中 Num 缺失,那么我会查找具有相同组合的其他观测,计算它们的平均值并替换该 NA。

如果这种组合不存在,我会寻找A = Alpha和B = Beta组合的观察结果(因此,术语"最近的分组" ),计算它们的平均值并替换它。

如果这种组合不存在,我会查找归类为A = Alpha的所有观察结果,计算它们的平均值并替换它。

如果这是唯一的观察,那么我们将其替换为 0。我已经在我发布的测试文件中创建了这样的场景。

虽然我的代码运行良好,但它非常程序化。我已经从做C / C ++编程转变而且我还不习惯R的矢量化方法。因此,我正在寻找一种方法:

a)清洁(请for循环请更少;内存更少,速度更快)。在编写代码时,我意识到我没有充分利用R编程的强大功能。

b)容易理解。

我在下面的示例输出中添加了注释,仅供参考。

输入数据:

dput(DFile)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G2", 
"G3", "G3", "G3", "G3", "G5", "G5", "G5", "G5", "G5", "G6"), 
    Country_SV = c("United States", "United States", "United States", 
    "United States", "United States", "United States", "United States", 
    NA, NA, NA, "Europe", "UK", "France", "Europe", "Europe", 
    "Australia"), Product_BU = c("Laptop", "Laptop", "Laptop", 
    "Laptop", "Laptop", "Laptop", "Laptop", NA, NA, NA, "Pencil", 
    "Power Cord", "Laptop", "Keyboard", "Mouse", "Motherboard"
    ), Prob_model3 = c(0, 79647405.9878251, 282615405.328728, 
    NA, NA, 363419594.065383, 0, 72870592.8458704, 260045174.088548, 
    369512727.253779, NA, 234, NA, 5, 10, NA)), .Names = c("Region_SL", 
"Country_SV", "Product_BU", "Prob_model3"), row.names = c(NA, 
16L), class = "data.frame")

预期输出 请注意,评论仅供参考。该栏目并不是必需的。

dput(Output)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G2", 
"G3", "G3", "G3", "G3", "G5", "G5", "G5", "G5", "G5", "G6"), 
    Country_SV = c("United States", "United States", "United States", 
    "United States", "United States", "United States", "United States", 
    "United States", "United States", "United States", "Europe", 
    "UK", "France", "Europe", "Europe", "Australia"), Product_BU = c("Laptop", 
    "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", 
    "Laptop", "Laptop", "Laptop", "Pencil", "Power Cord", "Laptop", 
    "Keyboard", "Mouse", "Motherboard"), Prob_model3 = c(0, 79647405.9878251, 
    282615405.328728, 120754270.438851, 363419594.065383, 363419594.065383, 
    0, 72870592.8458704, 260045174.088548, 369512727.253779, 
    7.5, 234, 83, 5, 10, 0), Comment = c(NA, NA, NA, "Grouped on G1, Laptop, US; Average of rows 1 to 3", 
    "Grouped on G2, US, Laptop; Average is the only value in row 6", 
    NA, NA, NA, NA, NA, "Group of G5, Europe and Pencil are unique; G5 and Europe exist. Average of row 14 and 15", 
    NA, "Group of G5, France and Laptop is unique; Group of G5 and France is unique as well; Use group of G5 and take average of row 12, 14, 15", 
    NA, NA, "Unique. Substitute 0")), .Names = c("Region_SL", 
"Country_SV", "Product_BU", "Prob_model3", "Comment"), row.names = c(NA, 
16L), class = "data.frame")

这是我的代码。(代码运行良好;预期输出存放在 b 中,它就是上面的 Output 去掉 Comment 列后的内容。)

# Goal: replace NA in Prob_model3 with the mean of the closest available
# grouping (Region + Country + Product, then Region + Country, then Region),
# and 0 when the row's grouping is unique.

# Work on a copy, sorted so the fill below pulls from adjacent rows of the
# same (or nearest) group.
DFile_New <-DFile
DFile_New<-DFile_New %>% 
  arrange(Region_SL, Country_SV,Product_BU) 

# Fill missing categorical keys from the previous row
# (zoo::na.locf = last observation carried forward) to complete cases.
DFile_New[,1:3]<-  zoo::na.locf(DFile_New[,1:3])

# Create look-up tables of group means, one per grouping level
# (finest to coarsest); count records the group size.
Lookup1<- DFile_New %>%
  dplyr::group_by(Region_SL, Country_SV, Product_BU) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

Lookup2<-DFile_New %>%
  dplyr::group_by(Region_SL, Country_SV) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

Lookup3<-DFile_New %>%
  dplyr::group_by(Region_SL) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

# Stack all three levels into one table; rows from coarser levels carry NA
# in the key columns they do not use.
Lookup_Table<-dplyr::bind_rows(Lookup1,Lookup2,Lookup3)

# Drop single-observation groups: their "mean" is just the row itself, so
# they cannot donate a replacement value.
Lookup_Table<-Lookup_Table[!Lookup_Table$count==1,]
colnames(Lookup_Table)[5]<-"Prob_model3"

# Pass 1: fill NAs from the Region + Country + Product means.
b<-DFile_New %>%
  dplyr::left_join(Lookup_Table,by=c("Region_SL", "Country_SV", "Product_BU"))
# coalesce keeps the original value and falls back to the lookup mean.
b$Prob_model3 <- coalesce(b$Prob_model3.x,b$Prob_model3.y)
# Drop the join leftovers.
b$Prob_model3.x<-NULL
b$Prob_model3.y<-NULL
b$count<-NULL
b<-b[!(is.na(b$Country_SV)&is.na(b$Product_BU)),]


# Pass 2: rows still NA fall back to the Region + Country means
# (lookup rows with Product_BU missing but Country_SV present).
c<-b[is.na(b$Prob_model3),] %>%
  dplyr::left_join(Lookup_Table[is.na(Lookup_Table$Product_BU) & !is.na(Lookup_Table$Country_SV),],by=c("Region_SL", "Country_SV")) %>%
    dplyr::mutate(Prob_model3 = coalesce(Prob_model3.x,Prob_model3.y)) %>%
    dplyr::select(Region_SL:Product_BU.x, Prob_model3)
colnames(c)[3]<-"Product_BU"
colnames(c)[4]<-"Prob_model3"

b<-rbind(b,c)
# NOTE(review): the unite() result below is not assigned, so this line has
# no effect and can probably be removed.
b%>% unite(Col,Region_SL:Product_BU,sep=".")
c<-b
# Keep resolved rows in b; c retains everything for the next pass.
b<-b[complete.cases(b[4]),]

# Pass 3: rows still NA fall back to the Region-only means
# (lookup rows with both Country_SV and Product_BU missing).
c<-c[is.na(c$Prob_model3),] %>%
  dplyr::left_join(Lookup_Table[is.na(Lookup_Table$Product_BU) & is.na(Lookup_Table$Country_SV),],by=c("Region_SL")) %>%
  dplyr::filter(!is.na(Prob_model3.y)) %>%
  dplyr::mutate(Prob_model3.1 = coalesce(Prob_model3.x,Prob_model3.y)) %>%
  dplyr::select(Region_SL:Product_BU.x, Prob_model3.1) %>%
  unique(.)
colnames(c)[3]<-"Product_BU"
colnames(c)[2]<-"Country_SV"

# Merge pass-3 results back into the resolved rows.
b<-b%>% 
  full_join(c) %>% 
  dplyr::mutate(Prob_model3.2 = coalesce(Prob_model3,Prob_model3.1)) %>%
  dplyr::select(Region_SL:Product_BU,Prob_model3.2)
colnames(b)[4]<-"Prob_model3"

# Any rows never matched by a lookup are unique observations: re-attach
# them and substitute 0 for their remaining NA.
b<-rbind(b,anti_join(DFile_New,b,by=c("Region_SL", "Country_SV", "Product_BU")))
b[is.na(b$Prob_model3),"Prob_model3"]<-0

我对R编程世界相对较新。我真诚地感谢任何帮助。 我最好寻找先进的解决方案 - lapply / dplyr / tidyr,一切都很好,只要它不像我的那么复杂。

我的sessionInfo:

R version 3.3.2 (2016-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] grDevices datasets  stats     graphics  grid      tcltk     utils     methods   base    

$otherPkgs
 [1] "bit"               "bit64"             "boot"              "car"               "compare"          
 [6] "corrgram"          "corrplot"          "cowplot"           "debug"             "directlabels"     
[11] "dplyr"             "foreign"           "Formula"           "ggplot2"           "ggthemes"         
[16] "gmodels"           "hexbin"            "Hmisc"             "installr"          "knitr"            
[21] "lattice"           "lubridate"         "magrittr"          "maps"              "openxlsx"         
[26] "pastecs"           "plotly"            "plyr"              "psych"             "purrr"            
[31] "R2HTML"            "readr"             "readstata13"       "reshape2"          "ResourceSelection"
[36] "rJava"             "rmarkdown"         "sm"                "stringr"           "survival"         
[41] "tables"            "tibble"            "tidyr"             "tidyverse"         "tufte"            
[46] "tufterhandout"     "vcd"               "xlsxjars"          "xts"               "zoo"              

$loadedOnly
 [1] "acepack"      "assertthat"   "backports"    "base64enc"    "bitops"       "broom"        "caTools"     
 [8] "checkmate"    "class"        "cluster"      "codetools"    "colorspace"   "data.table"   "DBI"         
[15] "dendextend"   "DEoptimR"     "digest"       "diptest"      "evaluate"     "flexmix"      "foreach"     
[22] "fpc"          "gclus"        "gdata"        "gplots"       "gridExtra"    "gtable"       "gtools"      
[29] "haven"        "hms"          "htmlTable"    "htmltools"    "htmlwidgets"  "httr"         "iterators"   
[36] "jsonlite"     "kernlab"      "KernSmooth"   "latticeExtra" "lazyeval"     "lme4"         "lmtest"      
[43] "MASS"         "Matrix"       "MatrixModels" "mclust"       "mgcv"         "minqa"        "mnormt"      
[50] "modelr"       "modeltools"   "munsell"      "mvbutils"     "mvtnorm"      "nlme"         "nloptr"      
[57] "nnet"         "parallel"     "pbkrtest"     "prabclus"     "quadprog"     "quantreg"     "R6"          
[64] "RColorBrewer" "Rcpp"         "readxl"       "registry"     "robustbase"   "rpart"        "rprojroot"   
[71] "rvest"        "scales"       "seriation"    "SparseM"      "splines"      "stats4"       "stringi"     
[78] "tools"        "trimcluster"  "TSP"          "viridisLite"  "whisker"      "xml2"   

1 个答案:

答案 0 :(得分:1)

使用dplyr工具有两步解决方案:

  1. 为特定类型的平均值创建表示“查找”的列;
  2. 以连续方式替换NA
  以下是代码:

    library(dplyr)

    # One lookup table per grouping level, finest (three keys) down to
    # coarsest (one key); each holds the group mean of Prob_model3.
    df_3 <- df %>%
      group_by(Region_SL, Country_SV, Product_BU) %>%
      summarise(lookup_3 = mean(Prob_model3, na.rm = TRUE))
    df_2 <- df %>%
      group_by(Region_SL, Country_SV) %>%
      summarise(lookup_2 = mean(Prob_model3, na.rm = TRUE))
    df_1 <- df %>%
      group_by(Region_SL) %>%
      summarise(lookup_1 = mean(Prob_model3, na.rm = TRUE))

    # Attach every candidate mean to each row, then keep the first
    # non-missing value per row:
    # original > 3-key mean > 2-key mean > 1-key mean > 0.
    with_lookups <- df %>%
      left_join(df_3, by = c("Region_SL", "Country_SV", "Product_BU")) %>%
      left_join(df_2, by = c("Region_SL", "Country_SV")) %>%
      left_join(df_1, by = c("Region_SL"))
    df_new <- with_lookups %>%
      mutate(modProb_model3 = coalesce(x = Prob_model3,
                                       lookup_3, lookup_2, lookup_1,
                                       0)) %>%
      select(Region_SL, Country_SV, Product_BU, Prob_model3 = modProb_model3)
    

    此处df是输入数据框。 df_1df_2df_3是具有特定平均信息的数据框(索引表示用于分组的分类变量的数量)。

    连续左连接后,使用 coalesce 函数创建新变量 modProb_model3:coalesce 在每个位置返回第一个非缺失值。

    修改

    我认为上述解决方案对这个特定问题最为合适。但是,例如当 NA 替换中有 10 个甚至更多可能的分组需要考虑时,引入一些自动化会更好。此自动化可以如下实现(使用包 tidyverse 和 lazyeval):

    # Generalized version: works for any number of grouping variables.
    library(tidyverse)

    # Column whose NA values are to be replaced.
    value_name <- "Prob_model3"
    # Grouping variables ordered coarsest to finest; prefixes of this vector
    # define the successive fallback groupings.
    max_group_vars <- c("Region_SL", "Country_SV", "Product_BU")
    n_group_vars <- length(max_group_vars)
    # Argument list for coalesce(): original column first, then lookup
    # columns from the finest grouping down to the coarsest.
    lookup_vars_list <- c(x = value_name, paste0("lookup_", n_group_vars:1)) %>%
      as.list()
    
    # Build one lookup table: mean of `value_name` per group, stored in a
    # column named lookup_<lookup_index>.
    # NOTE(review): group_by_/summarise_/select_/rename_ are the old
    # standard-evaluation verbs, deprecated since dplyr 0.7 — confirm the
    # installed dplyr version still supports them.
    get_lookup_table <- function(.data,
                                 group_vars,
                                 value_name = "Prob_model3",
                                 lookup_index = 1) {
      # Formula ~ mean(val, na.rm = TRUE) with `val` substituted by the
      # target column name (lazyeval-style non-standard evaluation).
      summarise_data <- (~ mean(val, na.rm = TRUE)) %>%
        lazyeval::interp(val = as.name(value_name)) %>%
        list() %>%
        setNames(paste0("lookup_", lookup_index))
      .data %>%
        group_by_(.dots = as.list(group_vars)) %>%
        summarise_(.dots = summarise_data)
    }
    
    # Left-join every lookup table onto df (reduce() applies left_join
    # pairwise; join columns are guessed, which is correct here because the
    # lookup keys match by name), then coalesce the original values with the
    # lookups and finally 0, and restore the original column name.
    df_new_1 <- c(
      list(df),
      map(n_group_vars:1, function(lookup_index) {
        get_lookup_table(.data = df,
                         group_vars = max_group_vars[1:lookup_index],
                         value_name = value_name,
                         lookup_index = lookup_index)
      })
    ) %>%
      reduce(left_join) %>%
      mutate(modValue = select_(., .dots = lookup_vars_list) %>%
               as.list() %>%
               c(0) %>%
               do.call(what = coalesce)) %>%
      select(-matches(match = paste0("^lookup_[0-9]+$|", value_name))) %>%
      rename_(.dots = setNames(list("modValue"), value_name))
    

    基本上算法相同,但代码更通用:它用分组平均值替换名为 value_name 的列中的 NA 值,这些分组由逐步缩减的列名集合定义(从存储在 max_group_vars 中的完整集合开始)。代码大量使用 dplyr 的标准评估(请参阅 https://cran.r-project.org/web/packages/dplyr/vignettes/nse.html)。以下是一些解释:

    1. get_lookup_table 为指定的分组变量创建查找表。查找列具有唯一名称 lookup_i,其中 i 指定查找表的索引;
    2. 连续左连接是使用 reduce 函数完成的,该函数将一个带有两个参数的函数(此处为 left_join)依次应用于前一次调用的结果和一个新值。例如:f1 = f(x1, x2) → f2 = f(f1, x3),依此类推。注意,左连接在未指定 by 参数的情况下也能正确完成,因为查找表的创建方式保证了"自然连接"是正确的。dplyr 也会给出关于猜测连接列的提示信息;
    3. modValue 是使用 coalesce 函数创建的,此前先为 do.call 构造了初步的参数列表;
    4. 管道中的最后两个元素:使用正则表达式选择适当的列,然后将 modValue 重命名为所需的 value_name。