我的目标是用一个查找表中的值来填充另一个表中的缺失值。有一个问题:这个查找表不是“Replace na's with value from another df”中讨论的那种一对一查找表,查找是基于多个列的分组完成的。因此,如果按这些分组在查找表中匹配到多个条目,就需要把所有这些条目都填入原始表中。
我能够完成这项任务,但我需要两件事情的帮助:
a)我的代码非常混乱。每次我都要做类似的事情,我最终会花费大量的时间来弄清楚我做了什么,然后重新使用它。所以,我会感激任何更干净,更简单的东西。
b)这很慢。我有多个ifelse语句。当我在包含3600万条记录的实际数据上运行时,需要花费很多时间。
这是我的虚拟数据来源:
# Dummy input table: console output of dput(DFile).
# A 16-row data.frame with columns Region_SL, Country_SV, Product_BU,
# Prob_model3, DoS.FY and Insured; the last three contain the NA values
# that the code further down fills in.
# To recreate the object, assign the structure(...) expression to DFile.
dput(DFile)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G2",
"G3", "G3", "G3", "G3", "G4", "G4", "G4", "G4", "G5", "G5"),
Country_SV = c("United States", "United States", "United States",
"United States", "United States", "United States", "United States",
"United States", "United States", "United States", "United States",
"United States", "United States", "United States", "UK",
"UK"), Product_BU = c("Laptop", "Laptop", "Laptop", "Laptop",
"Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Laptop",
"Laptop", "Laptop", "Laptop", "Laptop", "Power Cord", "Laptop"
), Prob_model3 = c(0, 79647405.9878251, 282615405.328728,
NA, NA, 363419594.065383, 0, 72870592.8458704, 260045174.088548,
369512727.253779, 0, 79906001.2878251, 285128278.558728,
405490639.873629, 234, NA), DoS.FY = c(2014, 2013, 2012,
NA, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2016, NA), Insured = c("Covered", "Covered", "Covered",
NA, NA, "Not Covered", "Not Covered", "Not Covered", "Not Covered",
"Not Covered", "Not Covered", "Not Covered", "Not Covered",
"Not Covered", "Covered", NA)), .Names = c("Region_SL", "Country_SV",
"Product_BU", "Prob_model3", "DoS.FY", "Insured"), row.names = c(NA,
16L), class = "data.frame")
这是我的分组查找表:
# Lookup table: console output of dput(Master_Joined).
# A 10-row data.frame with columns Region_SL, Country_SV, Product_BU, DoS.FY
# and Insured. One (Region_SL, Country_SV, Product_BU) key can carry several
# (DoS.FY, Insured) rows, e.g. G1 / United States / Laptop appears four times.
# To recreate the object, assign the structure(...) expression to
# Master_Joined.
dput(Master_Joined)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G3",
"G4", "G5", "G5", "G5"), Country_SV = c("United States", "United States",
"United States", "United States", "United States", "United States",
"United States", "UK", "UK", "UK"), Product_BU = c("Laptop",
"Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Power Cord",
"Laptop", "Laptop"), DoS.FY = c(2014, 2013, 2012, 2015, 2015,
2015, 2015, 2016, 2017, 2017), Insured = c("Covered", "Covered",
"Covered", "Uncovered", "Not Covered", "Not Covered", "Not Covered",
"Covered", "Uncovered", "Covered")), .Names = c("Region_SL",
"Country_SV", "Product_BU", "DoS.FY", "Insured"), row.names = c(NA,
10L), class = "data.frame")
之所以称它为“分组”查找表,是因为其中的所有条目都是唯一的。
最后,这是我的代码:
# Step 1: fill missing Prob_model3 values with the mean of their
# (Region_SL, Country_SV, Product_BU) group.

# Which fields are missing? Keep these rows: the following steps use them to
# look up DoS.FY / Insured in Master_Joined.
Missing <- DFile[is.na(DFile$Prob_model3), ]

# Work on the 4th column under a temporary name; the original name is restored
# once the pipeline is done.
Column_name <- colnames(DFile)[4]
colnames(DFile)[4] <- "temp_prob"

# Replace Prob_model3.
# ifelse() is vectorised, so it is evaluated once per group and no rowwise()
# pass is needed. A group without any non-missing value has a mean of NaN, so
# its rows are filled with NaN here.
DFile <- DFile %>%
  group_by(Region_SL, Country_SV, Product_BU) %>%
  dplyr::mutate(
    Average_Value = mean(temp_prob, na.rm = TRUE),
    Col_name1 = ifelse(is.na(temp_prob), Average_Value, temp_prob)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(Region_SL:Product_BU, DoS.FY, Insured, Col_name1)

# The filled column is now the 6th one; give it its original name back.
colnames(DFile)[6] <- Column_name
# Build Missing_FYear: the lookup rows from Master_Joined for every record
# whose Prob_model3 was missing.
# Drop DoS.FY from Missing first, so the only DoS.FY column after the join is
# the one coming from Master_Joined.
Missing$DoS.FY<-NULL
Missing_FYear<-Missing %>%
# Insured exists in both tables and is not a join key, so the join result
# carries it as Insured.x (from Missing) and Insured.y (from Master_Joined).
inner_join(Master_Joined,by = c("Region_SL", "Country_SV", "Product_BU")) %>%
group_by(Region_SL, Country_SV, Product_BU, DoS.FY, Insured.y) %>%
# Remove duplicated rows.
dplyr::distinct() %>%
# No `by` is given, so dplyr joins on every column name the two tables have in
# common; this adds Missing's un-suffixed Insured column, which the later
# join of DFile against Missing_FYear uses as a key.
left_join(Missing)
# Prob_model3 is not needed in the lookup result.
# NOTE(review): Missing_FYear is still grouped at this point (no ungroup()).
Missing_FYear$Prob_model3<-NULL
# Attach the looked-up DoS.FY / Insured values to DFile.
# A DFile row that matches several Missing_FYear rows is repeated once per
# match, which is how one incomplete record expands into all lookup entries.
DFile <- DFile %>%
  left_join(
    Missing_FYear,
    by = c("Region_SL", "Country_SV", "Product_BU", "Insured")
  ) %>%
  # Drop any (row-wise) grouping so the ifelse() calls below run vectorised
  # over whole columns instead of once per row.
  dplyr::ungroup() %>%
  dplyr::mutate(
    # DFile's own value wins; otherwise take the looked-up one; 0 when both
    # are missing.
    DoS.FY = ifelse(
      !is.na(DoS.FY.x),
      DoS.FY.x,
      ifelse(!is.na(DoS.FY.y), DoS.FY.y, 0)
    ),
    Insured_Combined = ifelse(is.na(Insured), Insured.y, Insured)
  ) %>%
  dplyr::select(Region_SL:Product_BU, Prob_model3, DoS.FY, Insured_Combined)

# The combined column is the 6th one; expose it under the original name.
colnames(DFile)[6] <- "Insured"
# Check again: the mean of a group with no non-missing Prob_model3 is NaN, so
# the imputation can leave NaN values behind. Replace them with 0.
# The replacement is unconditional: an all-FALSE row index is a no-op, and a
# guard on the number of affected rows would skip the single-row case.
DFile[is.nan(DFile$Prob_model3), "Prob_model3"] <- 0

# Rows that still have no probability; should be empty after the line above.
Missing <- DFile[is.na(DFile$Prob_model3), ]
预期输出:与运行上述代码后得到的DFile相同。
我真诚地感谢你的帮助。我一直在努力解决这个问题大约一个星期。
答案 0 :(得分:2)
一个想法是先找出含有NA的行所对应的Region_SL。完成后,我们使用plyr的rbind.fill把查找表中这些Region_SL对应的行与DFile绑定在一起,得到new_df。然后,我们过滤掉所有含NA的行(最后一列,即第6列除外)。我们创建了一个新变量Prob_model4,它保存每组Region_SL的均值。然后我们使用coalesce来“合并”两列。(注:下面的代码没有单独创建Prob_model4,而是用replace直接在Prob_model3中填入组均值。)
library(dplyr)
# Row numbers of DFile that contain at least one NA in any column.
ind <- unique(as.integer(which(is.na(DFile), arr.ind = TRUE)[, 1]))

# Stack the lookup rows of the affected regions on top of DFile.
# rbind.fill() pads the column Master_Joined does not have (Prob_model3) with
# NA and places it last, i.e. as the 6th column.
new_df <- plyr::rbind.fill(
  Master_Joined[Master_Joined$Region_SL %in% DFile$Region_SL[ind], ],
  DFile
)

new_df %>%
  arrange(Region_SL, Prob_model3) %>%
  # Drop rows that have an NA anywhere except in the 6th column (Prob_model3).
  filter(complete.cases(.[-6])) %>%
  group_by(Region_SL) %>%
  # Fill the remaining missing probabilities with the mean of their Region_SL.
  mutate(
    Prob_model3 = replace(
      Prob_model3,
      is.na(Prob_model3),
      mean(Prob_model3, na.rm = TRUE)
    )
  ) %>%
  ungroup()
# A tibble: 21 × 6
# Region_SL Country_SV Product_BU DoS.FY Insured Prob_model3
# <chr> <chr> <chr> <dbl> <chr> <dbl>
#1 G1 United States Laptop 2014 Covered 0
#2 G1 United States Laptop 2013 Covered 79647406
#3 G1 United States Laptop 2012 Covered 282615405
#4 G1 United States Laptop 2014 Covered 120754270
#5 G1 United States Laptop 2013 Covered 120754270
#6 G1 United States Laptop 2012 Covered 120754270
#7 G1 United States Laptop 2015 Uncovered 120754270
#8 G2 United States Laptop 2015 Not Covered 363419594
#9 G2 United States Laptop 2015 Not Covered 363419594
#10 G3 United States Laptop 2015 Not Covered 0
# ... with 11 more rows
答案 1 :(得分:0)
考虑这一点的另一种方法是仅将DoS.FY或Insured中缺少值的行与主数据合并:
# Replace missing probabilities by grouped average
DFile_new <- DFile %>%
  group_by(Region_SL, Country_SV, Product_BU) %>%
  mutate(
    Prob_model3 = coalesce(Prob_model3, mean(Prob_model3, na.rm = TRUE))
  ) %>%
  ungroup()

# This leads to one NaN because for
# 16 G5 UK Laptop NA NA <NA>
# there are no other rows in the same group
DFile_new$Prob_model3[is.nan(DFile_new$Prob_model3)] <- 0

# Split dataset into two parts
# 1) The part that has no NA's in DoS.FY and Insured
DFile_new1 <- filter(DFile_new, !is.na(DoS.FY) & !is.na(Insured))
# 2) The part has NA's in either DoS.FY or Insured
DFile_new2 <- filter(DFile_new, is.na(DoS.FY) | is.na(Insured))

# Merge DFile_new2 and Master_Joined.
# DoS.FY and Insured exist in both tables, so merge() suffixes them with
# .x (DFile_new2) and .y (Master_Joined); keep .x where present, else .y.
DFile_new2 <- merge(
  DFile_new2,
  Master_Joined,
  by = c("Region_SL", "Country_SV", "Product_BU")
) %>%
  mutate(
    DoS.FY.x = coalesce(DoS.FY.x, DoS.FY.y),
    Insured.x = coalesce(Insured.x, Insured.y)
  ) %>%
  select(-Insured.y, -DoS.FY.y) %>%
  rename(Insured = Insured.x, DoS.FY = DoS.FY.x)

# Put all rows in frame
my_out_new <- rbind(DFile_new1, DFile_new2)
这产生与OP代码相同的结果(尽管顺序不同):
# Greedily pair up the rows of two data frames.
#
# For every row i of df1, in order, the first not-yet-paired row j of df2 is
# accepted as its match when all cells that differ between the two rows are
# numerically within 1e-5 of each other. Differing cells that cannot be
# converted to numbers never match.
#
# df1, df2: data frames with the same columns.
# Returns rbind(idx1, idx2): a 2-row matrix whose columns are the matched
# (df1 row, df2 row) pairs, or NULL when no rows could be paired (including
# when either input has no rows).
# NOTE(review): behaviour for rows containing NA cells has not been verified.
compare <- function(df1, df2) {
  idx1 <- NULL
  idx2 <- NULL
  # taken[j] is TRUE once row j of df2 has been paired with a row of df1.
  taken <- logical(nrow(df2))
  for (i in seq_len(nrow(df1))) {
    for (j in which(!taken)) {
      differs <- as.logical(df1[i, ] != df2[j, ])
      # as.numeric() warns on non-numeric cells; those become NA and are
      # rejected by the anyNA() check below.
      d <- suppressWarnings(
        abs(as.numeric(df1[i, differs]) - as.numeric(df2[j, differs]))
      ) < 1e-5
      if (!anyNA(d) && all(d)) {
        idx1 <- c(idx1, i)
        idx2 <- c(idx2, j)
        taken[j] <- TRUE
        break
      }
    }
  }
  rbind(idx1, idx2)
}
> compare(my_out,my_out_new)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
idx1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
idx2 1 2 3 14 15 16 17 4 18 5 6 7 8 9 10 11 12
[,18] [,19] [,20]
idx1 18 19 20
idx2 13 19 20
(其中my_out是OP代码的最终DFile)