使用基于分组的其他表替换NA(不是逐个查找表)

时间:2017-01-20 07:18:10

标签: r dplyr

我的目标是用另一个查找表中的值替换一个表中的值。有一个问题:这个查找表不是Replace na's with value from another df中讨论的逐个查找表,但查找将基于多个列分组完成。因此,如果根据查找表中的这些分组返回多个条目,则需要在原始表中填充所有条目。

我能够完成这项任务,但我需要两件事情的帮助:

a)我的代码非常混乱。每次我都要做类似的事情,我最终会花费大量的时间来弄清楚我做了什么,然后重新使用它。所以,我会感激任何更干净,更简单的东西。

b)这很慢。我有多个ifelse语句。当我使用36M记录在实际数据上运行时,需要花费很多时间。

这是我的虚拟数据来源:

dput(DFile)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G2", 
"G3", "G3", "G3", "G3", "G4", "G4", "G4", "G4", "G5", "G5"), 
    Country_SV = c("United States", "United States", "United States", 
    "United States", "United States", "United States", "United States", 
    "United States", "United States", "United States", "United States", 
    "United States", "United States", "United States", "UK", 
    "UK"), Product_BU = c("Laptop", "Laptop", "Laptop", "Laptop", 
    "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", 
    "Laptop", "Laptop", "Laptop", "Laptop", "Power Cord", "Laptop"
    ), Prob_model3 = c(0, 79647405.9878251, 282615405.328728, 
    NA, NA, 363419594.065383, 0, 72870592.8458704, 260045174.088548, 
    369512727.253779, 0, 79906001.2878251, 285128278.558728, 
    405490639.873629, 234, NA), DoS.FY = c(2014, 2013, 2012, 
    NA, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 
    2015, 2016, NA), Insured = c("Covered", "Covered", "Covered", 
    NA, NA, "Not Covered", "Not Covered", "Not Covered", "Not Covered", 
    "Not Covered", "Not Covered", "Not Covered", "Not Covered", 
    "Not Covered", "Covered", NA)), .Names = c("Region_SL", "Country_SV", 
"Product_BU", "Prob_model3", "DoS.FY", "Insured"), row.names = c(NA, 
16L), class = "data.frame")

这是我的分组查找表:

dput(Master_Joined)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G3", 
"G4", "G5", "G5", "G5"), Country_SV = c("United States", "United States", 
"United States", "United States", "United States", "United States", 
"United States", "UK", "UK", "UK"), Product_BU = c("Laptop", 
"Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Power Cord", 
"Laptop", "Laptop"), DoS.FY = c(2014, 2013, 2012, 2015, 2015, 
2015, 2015, 2016, 2017, 2017), Insured = c("Covered", "Covered", 
"Covered", "Uncovered", "Not Covered", "Not Covered", "Not Covered", 
"Covered", "Uncovered", "Covered")), .Names = c("Region_SL", 
"Country_SV", "Product_BU", "DoS.FY", "Insured"), row.names = c(NA, 
10L), class = "data.frame")

在所有条目都是唯一的意义上,这是“分组”。

最后,这是我的代码:

#Which fields are missing?
Missing<-DFile[is.na(DFile$Prob_model3),]

Column_name<-colnames(DFile)[4]
colnames(DFile)[4]<-"temp_prob"

#Replace Prob_model3
DFile<-DFile %>%
  group_by(Region_SL, Country_SV, Product_BU) %>%
  dplyr::mutate(Average_Value = mean(temp_prob,na.rm = TRUE)) %>%
  rowwise() %>%
  dplyr::mutate(Col_name1 = ifelse(is.na(temp_prob),Average_Value,temp_prob)) %>%
  dplyr::select(Region_SL:Product_BU,DoS.FY,Insured,Col_name1)

colnames(DFile)[6]<-Column_name

  Missing$DoS.FY<-NULL

  Missing_FYear<-Missing %>% 
    inner_join(Master_Joined,by = c("Region_SL", "Country_SV", "Product_BU")) %>%
    group_by(Region_SL, Country_SV, Product_BU, DoS.FY, Insured.y) %>%
    dplyr::distinct() %>%
    left_join(Missing)

  Missing_FYear$Prob_model3<-NULL

  DFile <-DFile %>% 
    left_join(Missing_FYear,by = c("Region_SL", "Country_SV", "Product_BU", "Insured")) %>%
    dplyr::rowwise() %>%
    mutate(DoS.FY=ifelse((is.na(`DoS.FY.y`)|is.na(`DoS.FY.x`)),sum(`DoS.FY.y`,`DoS.FY.x`,na.rm=TRUE),`DoS.FY.x`), Insured_Combined = ifelse(is.na(Insured),Insured.y,Insured)) %>%
    dplyr::select(Region_SL:Product_BU,Prob_model3,DoS.FY, Insured_Combined)  

  colnames(DFile)[6]<-"Insured"
  #Check again
  Missing<-DFile[is.na(DFile$Prob_model3),] 

  if (nrow(Missing) > 1)
  { #you have NaNs, replace them with 0
    DFile[is.nan(DFile$Prob_model3),"Prob_model3"] <- 0
   }
  Missing<-DFile[is.na(DFile$Prob_model3),] 

预期输出:DFile与运行上述代码后一样。

我真诚地感谢你的帮助。我一直在努力解决这个问题大约一个星期。

2 个答案:

答案 0 :(得分:2)

一个想法是找到Region_SL NA。完成后,我们使用plyr rbind.fill来绑定new_df。然后,我们使用NA过滤掉所有行(最后一列 - 第6列除外)。我们创建了一个新变量Prob_model4,它保存每组Region_SL的均值。然后我们使用coalesce来&#34;合并&#34;两列。

library(dplyr)
ind <- unique(as.integer(which(is.na(DFile), arr.ind = TRUE)[,1]))
new_df <- plyr::rbind.fill(Master_joined[Master_joined$Region_SL %in% DFile$Region_SL[ind],], DFile)

new_df %>% 
  arrange(Region_SL, Prob_model3) %>% 
  filter(complete.cases(.[-6])) %>% 
  group_by(Region_SL) %>% 
  mutate(Prob_model3 = replace(Prob_model3, is.na(Prob_model3), mean(Prob_model3, na.rm = T))) %>%  
  ungroup()

# A tibble: 21 × 6
#   Region_SL    Country_SV Product_BU DoS.FY     Insured Prob_model3
#       <chr>         <chr>      <chr>  <dbl>       <chr>       <dbl>
#1         G1 United States     Laptop   2014     Covered           0
#2         G1 United States     Laptop   2013     Covered    79647406
#3         G1 United States     Laptop   2012     Covered   282615405
#4         G1 United States     Laptop   2014     Covered   120754270
#5         G1 United States     Laptop   2013     Covered   120754270
#6         G1 United States     Laptop   2012     Covered   120754270
#7         G1 United States     Laptop   2015   Uncovered   120754270
#8         G2 United States     Laptop   2015 Not Covered   363419594
#9         G2 United States     Laptop   2015 Not Covered   363419594
#10        G3 United States     Laptop   2015 Not Covered           0
# ... with 11 more rows

答案 1 :(得分:0)

考虑这一点的另一种方法是仅将DoS.FY或Insured中缺少值的行与主数据合并:

#Replace missing probabilities by grouped average
DFile_new <- DFile %>% group_by(Region_SL,Country_SV,Product_BU) %>% mutate(Prob_model3 = coalesce(Prob_model3,mean(Prob_model3, na.rm = T))) %>% ungroup()
#This leads to one NaN because for
# 16        G5            UK     Laptop          NA     NA        <NA>
#there are no other rows in the same group
DFile_new$Prob_model3[is.nan(DFile_new$Prob_model3)] <- 0

#Split dataset into two parts
#1) The part that has no NA's in DoS.FY and Insured
DFile_new1 <- filter(DFile_new,!is.na(DoS.FY) & !is.na(Insured))
#2) The part has NA's in either DoS.FY or Insured
DFile_new2 <- filter(DFile_new,is.na(DoS.FY) | is.na(Insured))

#merge DFile_new2 and Master_Joined
DFile_new2 <- merge(DFile_new2,Master_Joined,by=c("Region_SL","Country_SV","Product_BU")) %>%
  mutate(DoS.FY.x = coalesce(DoS.FY.x,DoS.FY.y), Insured.x = coalesce(Insured.x,Insured.y)) %>%
  select(-Insured.y,-DoS.FY.y) %>% rename(Insured=Insured.x, DoS.FY = DoS.FY.x)

#Put all rows in frame
my_out_new <- rbind(DFile_new1,DFile_new2)

这产生与OP代码相同的结果(尽管顺序不同):

> compare <- function(df1,df2) {
+   idx1 <- c()
+   idx2 <- c()
+   for(i in 1:nrow(df1)) {
+     found <- FALSE
+     for(j in 1:nrow(df2)) {
+       if(!(j %in% idx2)) {
+         idx = as.logical(df1[i,] != df2[j,])
+         d <- suppressWarnings(abs(as.numeric(df1[i,idx])-as.numeric(df2[j,idx]))) < 1e-5
+         if(!(any(is.na(d))) & all(d)) {
+           idx1 <- c(idx1,i)
+           idx2 <- c(idx2,j)
+           break;
+         }
+       }
+     }
+   }
+   rbind(idx1,idx2)
+ }
> compare(my_out,my_out_new)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
idx1    1    2    3    4    5    6    7    8    9    10    11    12    13    14    15    16    17
idx2    1    2    3   14   15   16   17    4   18     5     6     7     8     9    10    11    12
     [,18] [,19] [,20]
idx1    18    19    20
idx2    13    19    20

(其中my_out是OP代码的最终DFile)