根据表中最接近的列分组的平均值替换NA的值

时间:2017-01-21 08:00:15

标签: r dplyr tidyr

我的目标是用最接近的分组的平均值替换 NA。例如,假设有四个变量 A、B、C 和 Num,其中 Num 是数值变量,而 A、B、C 是分类变量。现在,如果在 A = Alpha、B = Beta、C = Theta 的某一行中 Num 缺失,那么我会查找具有相同组合的其他观测,计算它们的平均值并替换该 NA。

如果这种组合不存在,我会寻找A = Alpha和B = Beta组合的观察结果(因此,术语"最近的分组" ),计算它们的平均值并替换它。

如果这种组合不存在,我会查找归类为A = Alpha的所有观察结果,计算它们的平均值并替换它。

如果这是唯一的观察,那么我们将其替换为 0。我已经在我发布的测试文件中创建了这样的场景。

虽然我的代码运行良好,但它非常程序化。我已经从做C / C ++编程转变而且我还不习惯R的矢量化方法。因此,我正在寻找一种方法:

a)清洁(请for循环请更少;内存更少,速度更快)。在编写代码时,我意识到我没有充分利用R编程的强大功能。

b)容易理解。

我在下面的示例输出中添加了注释,仅供参考。

输入数据:

dput(DFile)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G2", 
"G3", "G3", "G3", "G3", "G5", "G5", "G5", "G5", "G5", "G6"), 
    Country_SV = c("United States", "United States", "United States", 
    "United States", "United States", "United States", "United States", 
    NA, NA, NA, "Europe", "UK", "France", "Europe", "Europe", 
    "Australia"), Product_BU = c("Laptop", "Laptop", "Laptop", 
    "Laptop", "Laptop", "Laptop", "Laptop", NA, NA, NA, "Pencil", 
    "Power Cord", "Laptop", "Keyboard", "Mouse", "Motherboard"
    ), Prob_model3 = c(0, 79647405.9878251, 282615405.328728, 
    NA, NA, 363419594.065383, 0, 72870592.8458704, 260045174.088548, 
    369512727.253779, NA, 234, NA, 5, 10, NA)), .Names = c("Region_SL", 
"Country_SV", "Product_BU", "Prob_model3"), row.names = c(NA, 
16L), class = "data.frame")

预期输出 请注意,评论仅供参考。该栏目并不是必需的。

dput(Output)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G2", 
"G3", "G3", "G3", "G3", "G5", "G5", "G5", "G5", "G5", "G6"), 
    Country_SV = c("United States", "United States", "United States", 
    "United States", "United States", "United States", "United States", 
    "United States", "United States", "United States", "Europe", 
    "UK", "France", "Europe", "Europe", "Australia"), Product_BU = c("Laptop", 
    "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", 
    "Laptop", "Laptop", "Laptop", "Pencil", "Power Cord", "Laptop", 
    "Keyboard", "Mouse", "Motherboard"), Prob_model3 = c(0, 79647405.9878251, 
    282615405.328728, 120754270.438851, 363419594.065383, 363419594.065383, 
    0, 72870592.8458704, 260045174.088548, 369512727.253779, 
    7.5, 234, 83, 5, 10, 0), Comment = c(NA, NA, NA, "Grouped on G1, Laptop, US; Average of rows 1 to 3", 
    "Grouped on G2, US, Laptop; Average is the only value in row 6", 
    NA, NA, NA, NA, NA, "Group of G5, Europe and Pencil are unique; G5 and Europe exist. Average of row 14 and 15", 
    NA, "Group of G5, France and Laptop is unique; Group of G5 and France is unique as well; Use group of G5 and take average of row 12, 14, 15", 
    NA, NA, "Unique. Substitute 0")), .Names = c("Region_SL", 
"Country_SV", "Product_BU", "Prob_model3", "Comment"), row.names = c(NA, 
16L), class = "data.frame")

这是我的代码。(代码运行良好;预期输出存放在 b 中,它就是上面的 Output 去掉 Comment 列后的内容。)

# Goal: replace NA in Prob_model3 with the mean of the closest available
# grouping (Region + Country + Product, then Region + Country, then Region),
# and 0 when the row's grouping is unique.

# Work on a copy, sorted so the fill below pulls from adjacent rows of the
# same (or nearest) group.
DFile_New <-DFile
DFile_New<-DFile_New %>% 
  arrange(Region_SL, Country_SV,Product_BU) 

# Fill missing categorical keys from the previous row
# (zoo::na.locf = last observation carried forward) to complete cases.
DFile_New[,1:3]<-  zoo::na.locf(DFile_New[,1:3])

# Create look-up tables of group means, one per grouping level
# (finest to coarsest); count records the group size.
Lookup1<- DFile_New %>%
  dplyr::group_by(Region_SL, Country_SV, Product_BU) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

Lookup2<-DFile_New %>%
  dplyr::group_by(Region_SL, Country_SV) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

Lookup3<-DFile_New %>%
  dplyr::group_by(Region_SL) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

# Stack all three levels into one table; rows from coarser levels carry NA
# in the key columns they do not use.
Lookup_Table<-dplyr::bind_rows(Lookup1,Lookup2,Lookup3)

# Drop single-observation groups: their "mean" is just the row itself, so
# they cannot donate a replacement value.
Lookup_Table<-Lookup_Table[!Lookup_Table$count==1,]
colnames(Lookup_Table)[5]<-"Prob_model3"

# Pass 1: fill NAs from the Region + Country + Product means.
b<-DFile_New %>%
  dplyr::left_join(Lookup_Table,by=c("Region_SL", "Country_SV", "Product_BU"))
# coalesce keeps the original value and falls back to the lookup mean.
b$Prob_model3 <- coalesce(b$Prob_model3.x,b$Prob_model3.y)
# Drop the join leftovers.
b$Prob_model3.x<-NULL
b$Prob_model3.y<-NULL
b$count<-NULL
b<-b[!(is.na(b$Country_SV)&is.na(b$Product_BU)),]


# Pass 2: rows still NA fall back to the Region + Country means
# (lookup rows with Product_BU missing but Country_SV present).
c<-b[is.na(b$Prob_model3),] %>%
  dplyr::left_join(Lookup_Table[is.na(Lookup_Table$Product_BU) & !is.na(Lookup_Table$Country_SV),],by=c("Region_SL", "Country_SV")) %>%
    dplyr::mutate(Prob_model3 = coalesce(Prob_model3.x,Prob_model3.y)) %>%
    dplyr::select(Region_SL:Product_BU.x, Prob_model3)
colnames(c)[3]<-"Product_BU"
colnames(c)[4]<-"Prob_model3"

b<-rbind(b,c)
# NOTE(review): the unite() result below is not assigned, so this line has
# no effect and can probably be removed.
b%>% unite(Col,Region_SL:Product_BU,sep=".")
c<-b
# Keep resolved rows in b; c retains everything for the next pass.
b<-b[complete.cases(b[4]),]

# Pass 3: rows still NA fall back to the Region-only means
# (lookup rows with both Country_SV and Product_BU missing).
c<-c[is.na(c$Prob_model3),] %>%
  dplyr::left_join(Lookup_Table[is.na(Lookup_Table$Product_BU) & is.na(Lookup_Table$Country_SV),],by=c("Region_SL")) %>%
  dplyr::filter(!is.na(Prob_model3.y)) %>%
  dplyr::mutate(Prob_model3.1 = coalesce(Prob_model3.x,Prob_model3.y)) %>%
  dplyr::select(Region_SL:Product_BU.x, Prob_model3.1) %>%
  unique(.)
colnames(c)[3]<-"Product_BU"
colnames(c)[2]<-"Country_SV"

# Merge pass-3 results back into the resolved rows.
b<-b%>% 
  full_join(c) %>% 
  dplyr::mutate(Prob_model3.2 = coalesce(Prob_model3,Prob_model3.1)) %>%
  dplyr::select(Region_SL:Product_BU,Prob_model3.2)
colnames(b)[4]<-"Prob_model3"

# Any rows never matched by a lookup are unique observations: re-attach
# them and substitute 0 for their remaining NA.
b<-rbind(b,anti_join(DFile_New,b,by=c("Region_SL", "Country_SV", "Product_BU")))
b[is.na(b$Prob_model3),"Prob_model3"]<-0

我对R编程世界相对较新。我真诚地感谢任何帮助。 我最好寻找先进的解决方案 - lapply / dplyr / tidyr,一切都很好,只要它不像我的那么复杂。

我的sessionInfo:

R version 3.3.2 (2016-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] grDevices datasets  stats     graphics  grid      tcltk     utils     methods   base    

$otherPkgs
 [1] "bit"               "bit64"             "boot"              "car"               "compare"          
 [6] "corrgram"          "corrplot"          "cowplot"           "debug"             "directlabels"     
[11] "dplyr"             "foreign"           "Formula"           "ggplot2"           "ggthemes"         
[16] "gmodels"           "hexbin"            "Hmisc"             "installr"          "knitr"            
[21] "lattice"           "lubridate"         "magrittr"          "maps"              "openxlsx"         
[26] "pastecs"           "plotly"            "plyr"              "psych"             "purrr"            
[31] "R2HTML"            "readr"             "readstata13"       "reshape2"          "ResourceSelection"
[36] "rJava"             "rmarkdown"         "sm"                "stringr"           "survival"         
[41] "tables"            "tibble"            "tidyr"             "tidyverse"         "tufte"            
[46] "tufterhandout"     "vcd"               "xlsxjars"          "xts"               "zoo"              

$loadedOnly
 [1] "acepack"      "assertthat"   "backports"    "base64enc"    "bitops"       "broom"        "caTools"     
 [8] "checkmate"    "class"        "cluster"      "codetools"    "colorspace"   "data.table"   "DBI"         
[15] "dendextend"   "DEoptimR"     "digest"       "diptest"      "evaluate"     "flexmix"      "foreach"     
[22] "fpc"          "gclus"        "gdata"        "gplots"       "gridExtra"    "gtable"       "gtools"      
[29] "haven"        "hms"          "htmlTable"    "htmltools"    "htmlwidgets"  "httr"         "iterators"   
[36] "jsonlite"     "kernlab"      "KernSmooth"   "latticeExtra" "lazyeval"     "lme4"         "lmtest"      
[43] "MASS"         "Matrix"       "MatrixModels" "mclust"       "mgcv"         "minqa"        "mnormt"      
[50] "modelr"       "modeltools"   "munsell"      "mvbutils"     "mvtnorm"      "nlme"         "nloptr"      
[57] "nnet"         "parallel"     "pbkrtest"     "prabclus"     "quadprog"     "quantreg"     "R6"          
[64] "RColorBrewer" "Rcpp"         "readxl"       "registry"     "robustbase"   "rpart"        "rprojroot"   
[71] "rvest"        "scales"       "seriation"    "SparseM"      "splines"      "stats4"       "stringi"     
[78] "tools"        "trimcluster"  "TSP"          "viridisLite"  "whisker"      "xml2"   

1 个答案:

答案 0 :(得分:1)

使用dplyr工具有两步解决方案:

  1. 为特定类型的平均值创建表示“查找”的列;
  2. 以连续方式替换NA
  以下是代码:

    library(dplyr)

    # One lookup table per grouping level, finest (three keys) down to
    # coarsest (one key); each holds the group mean of Prob_model3.
    df_3 <- df %>%
      group_by(Region_SL, Country_SV, Product_BU) %>%
      summarise(lookup_3 = mean(Prob_model3, na.rm = TRUE))
    df_2 <- df %>%
      group_by(Region_SL, Country_SV) %>%
      summarise(lookup_2 = mean(Prob_model3, na.rm = TRUE))
    df_1 <- df %>%
      group_by(Region_SL) %>%
      summarise(lookup_1 = mean(Prob_model3, na.rm = TRUE))

    # Attach every candidate mean to each row, then keep the first
    # non-missing value per row:
    # original > 3-key mean > 2-key mean > 1-key mean > 0.
    with_lookups <- df %>%
      left_join(df_3, by = c("Region_SL", "Country_SV", "Product_BU")) %>%
      left_join(df_2, by = c("Region_SL", "Country_SV")) %>%
      left_join(df_1, by = c("Region_SL"))
    df_new <- with_lookups %>%
      mutate(modProb_model3 = coalesce(x = Prob_model3,
                                       lookup_3, lookup_2, lookup_1,
                                       0)) %>%
      select(Region_SL, Country_SV, Product_BU, Prob_model3 = modProb_model3)
    

    此处df是输入数据框。 df_1df_2df_3是具有特定平均信息的数据框(索引表示用于分组的分类变量的数量)。

    连续左连接后,使用 coalesce 函数创建新变量 modProb_model3:coalesce 在每个位置返回第一个非缺失值。

    修改

    我认为上述解决方案对这个特定问题最为合适。但是,例如当 NA 替换中有 10 个甚至更多可能的分组需要考虑时,引入一些自动化会更好。此自动化可以如下实现(使用包 tidyverse 和 lazyeval):

    # Generalized version: works for any number of grouping variables.
    library(tidyverse)

    # Column whose NA values are to be replaced.
    value_name <- "Prob_model3"
    # Grouping variables ordered coarsest to finest; prefixes of this vector
    # define the successive fallback groupings.
    max_group_vars <- c("Region_SL", "Country_SV", "Product_BU")
    n_group_vars <- length(max_group_vars)
    # Argument list for coalesce(): original column first, then lookup
    # columns from the finest grouping down to the coarsest.
    lookup_vars_list <- c(x = value_name, paste0("lookup_", n_group_vars:1)) %>%
      as.list()
    
    # Build one lookup table: mean of `value_name` per group, stored in a
    # column named lookup_<lookup_index>.
    # NOTE(review): group_by_/summarise_/select_/rename_ are the old
    # standard-evaluation verbs, deprecated since dplyr 0.7 — confirm the
    # installed dplyr version still supports them.
    get_lookup_table <- function(.data,
                                 group_vars,
                                 value_name = "Prob_model3",
                                 lookup_index = 1) {
      # Formula ~ mean(val, na.rm = TRUE) with `val` substituted by the
      # target column name (lazyeval-style non-standard evaluation).
      summarise_data <- (~ mean(val, na.rm = TRUE)) %>%
        lazyeval::interp(val = as.name(value_name)) %>%
        list() %>%
        setNames(paste0("lookup_", lookup_index))
      .data %>%
        group_by_(.dots = as.list(group_vars)) %>%
        summarise_(.dots = summarise_data)
    }
    
    # Left-join every lookup table onto df (reduce() applies left_join
    # pairwise; join columns are guessed, which is correct here because the
    # lookup keys match by name), then coalesce the original values with the
    # lookups and finally 0, and restore the original column name.
    df_new_1 <- c(
      list(df),
      map(n_group_vars:1, function(lookup_index) {
        get_lookup_table(.data = df,
                         group_vars = max_group_vars[1:lookup_index],
                         value_name = value_name,
                         lookup_index = lookup_index)
      })
    ) %>%
      reduce(left_join) %>%
      mutate(modValue = select_(., .dots = lookup_vars_list) %>%
               as.list() %>%
               c(0) %>%
               do.call(what = coalesce)) %>%
      select(-matches(match = paste0("^lookup_[0-9]+$|", value_name))) %>%
      rename_(.dots = setNames(list("modValue"), value_name))
    

    基本上算法相同,但代码更通用:它用分组平均值替换名为 value_name 的列中的 NA 值,这些分组由逐步缩减的列名集合定义(从存储在 max_group_vars 中的完整集合开始)。代码大量使用 dplyr 的标准评估(请参阅 https://cran.r-project.org/web/packages/dplyr/vignettes/nse.html)。以下是一些解释:

    1. get_lookup_table 为指定的分组变量创建查找表。查找列具有唯一名称 lookup_i,其中 i 指定查找表的索引;
    2. 连续左连接是使用 reduce 函数完成的,该函数将一个带有两个参数的函数(此处为 left_join)依次应用于前一次调用的结果和一个新值。例如:f1 = f(x1, x2) → f2 = f(f1, x3),依此类推。注意,左连接在未指定 by 参数的情况下也能正确完成,因为查找表的创建方式保证了"自然连接"是正确的。dplyr 也会给出关于猜测连接列的提示信息;
    3. modValue 是使用 coalesce 函数创建的,此前先为 do.call 构造了初步的参数列表;
    4. 管道中的最后两个元素:使用正则表达式选择适当的列,然后将 modValue 重命名为所需的 value_name。