基于来自不同DataFrame的分组值更新DataFrame

时间:2017-02-26 09:12:11

标签: r data.table dplyr

我有一个数据框,其列需要根据查找表进行更新。查找表基于分组集。如果找不到匹配项,则要更新的值将保留为空白。

以下是我的输入数据:

dput(DF_Generated)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234", 
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456", 
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2", 
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012, 
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014, 
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X", 
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700", 
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667, 
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1, 
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA", 
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA", 
NA), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2", "N2", 
NA, "N3", NA, "N4", NA)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L), .Names = c("PO_ID", "SO_ID", "F_Year", 
"Product_ID", "Revenue", "Quantity", "Location1", "Name"))

这是我的查询表:

dput(DF_Lookup_2)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234", 
"P1234", "P2345", "P2345", "P3456", NA), SO_ID = c("S1", "S1", 
"S1", "S2", "S2", "S3", "S4", "S7", "S10"), F_Year = c(2012, 
2012, 2012, 2013, 2013, 2011, 2011, 2014, 2015), Location1 = c("MA", 
"NY", "WA", "NY", "WA", "IL", "IL", "MN", "CA"), Name = c("N1", 
"N1", "N1", "N1", "N1", "N2", "N2", "N3", "N4")), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -9L), .Names = c("PO_ID", 
"SO_ID", "F_Year", "Location1", "Name"))

预期输出为:

dput(DFO)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234", 
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456", 
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2", 
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012, 
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014, 
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X", 
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700", 
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667, 
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1, 
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA", 
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA", 
"CA"), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2", 
"N2", NA, "N3", NA, "N4", "N4")), .Names = c("PO_ID", "SO_ID", 
"F_Year", "Product_ID", "Revenue", "Quantity", "Location1", "Name"
), row.names = c(NA, 14L), class = "data.frame")

逻辑:

查找基于三列完成:PO_ID、SO_ID 和 F_Year。如果找到匹配项,则仅当数据框需要更新时才会覆盖条目。例如,PO_ID = P1234、SO_ID = S1、F_Year = 2012 的行中,Location1 和 Name 列的条目不应被覆盖,因为它们的值已经存在。但是,对于 PO_ID = NA、SO_ID = S10、F_Year = 2015 这样的行,Location1 和 Name 列中缺失的条目需要使用查找表中的有效值进行更新,该值分别为 CA 和 N4。我尝试使用 data.table,但我的代码覆盖了所有条目,这是不正确的。

我读了Compare and merge two dataframes线程,并尝试这样做,但代码覆盖了我不需要查找的现有条目。

这是我的代码:

data.table::setDT(DF_Generated)
data.table::setDT(DF_Lookup_2)
data.table::setkey(DF_Generated,PO_ID,SO_ID,F_Year)
data.table::setkey(DF_Lookup_2,PO_ID,SO_ID,F_Year)
DF_Generated[DF_Lookup_2,on=c("PO_ID","SO_ID","F_Year"),c("Location1","Name"):=list(i.Location1,i.Name)]

我有两个问题:

问题1)我正在使用data.table,因为我的实际数据很大。所以,我正在寻找基于data.table的解决方案。我该如何修复我的data.table代码?

问题2)如果dplyr是推荐的方式,我也愿意使用dplyr。

但是,对于我的学习,如果你能帮我解决这两个问题,我真的很感激。我是初学者,还在学习这两个包。

2 个答案:

答案 0 :(得分:2)

这样做的另一个方法是(按条件)与DF_Lookup_2进行连接,然后再把结果赋值回DF_Generated。之所以反过来连接,是因为在X[Y]连接中,结果的行数与Y相同;因此,DF_Lookup_2[DF_Generated]这种形式的连接将为我们提供所需的长度。然后,我们可以按原样将其放回DF_Generated。

# Only rows that are missing Location1 or Name are assigned to. The lookup
# table is joined against exactly those rows; an X[Y] join returns its rows
# in the order of Y, so the looked-up values line up with the rows being
# filled (this relies on each incomplete row matching at most one lookup row).
DF_Generated[
  is.na(Location1) | is.na(Name),
  c("Location1", "Name") := DF_Lookup_2[
    DF_Generated[is.na(Location1) | is.na(Name)],
    list(Location1, Name),
    on = c("PO_ID", "SO_ID", "F_Year")
  ]
]


identical(DF_Generated, setDT(DFO))
## [1] TRUE

答案 1 :(得分:1)

我们可以在按'PO_ID'、'SO_ID'和'F_Year'进行连接(on)时创建两个新列,然后使用set更新旧列中的NA值:

# Update join: for every row whose PO_ID, SO_ID and F_Year match the lookup
# table, copy the lookup values into temporary columns "Location1N"/"NameN".
setDT(DF_Generated)[setDT(DF_Lookup_2),
                    c("Location1N", "NameN") := list(i.Location1, i.Name),
                    on = .(PO_ID, SO_ID, F_Year)]
nm1 <- c("Location1", "Name")
nm2 <- paste0(nm1, "N")
# Fill only the entries that are still NA so existing values are never
# overwritten. The NA positions are computed once per column from
# DF_Generated itself (the original indexed an undefined object `df1` here)
# and are used both for the rows to assign and for the replacement values.
for (j in seq_along(nm1)) {
  na_rows <- which(is.na(DF_Generated[[nm1[j]]]))
  set(DF_Generated, i = na_rows, j = nm1[j],
      value = DF_Generated[[nm2[j]]][na_rows])
}
# Drop the temporary columns and print the result.
DF_Generated[, (nm2) := NULL][]

identical(setDT(DFO), DF_Generated)
#[1] TRUE