我有一个数据框,其中某些列需要根据查找表进行更新。查找基于一组分组列(键)进行。如果找不到匹配项,则待更新的值保持为空(NA)。
以下是我的输入数据:
# Input data: the dput() call followed by its printed output.
# The structure(...) expression rebuilds a 14-row tibble (class tbl_df) with the
# columns PO_ID, SO_ID, F_Year, Product_ID, Revenue, Quantity, Location1 and
# Name. To reproduce the example, assign it: DF_Generated <- structure(...).
dput(DF_Generated)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456",
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2",
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014,
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X",
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700",
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667,
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1,
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA",
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA",
NA), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2", "N2",
NA, "N3", NA, "N4", NA)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L), .Names = c("PO_ID", "SO_ID", "F_Year",
"Product_ID", "Revenue", "Quantity", "Location1", "Name"))
这是我的查询表:
# Lookup table: the dput() call followed by its printed output.
# The structure(...) expression rebuilds a 9-row tibble (class tbl_df) with the
# key columns PO_ID, SO_ID, F_Year and the value columns Location1 and Name.
# Note: the key is not unique here -- (P1234, S1, 2012) appears three times with
# different Location1 values, and the last row has PO_ID = NA.
dput(DF_Lookup_2)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P2345", "P2345", "P3456", NA), SO_ID = c("S1", "S1",
"S1", "S2", "S2", "S3", "S4", "S7", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2011, 2011, 2014, 2015), Location1 = c("MA",
"NY", "WA", "NY", "WA", "IL", "IL", "MN", "CA"), Name = c("N1",
"N1", "N1", "N1", "N1", "N2", "N2", "N3", "N4")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L), .Names = c("PO_ID",
"SO_ID", "F_Year", "Location1", "Name"))
预期输出为:
# Expected output: the dput() call followed by its printed output.
# The structure(...) expression rebuilds a plain 14-row data.frame with the same
# columns as the input. Compared with the input data, only the last row changes:
# its Location1 becomes "CA" and its Name becomes "N4" (both were NA).
dput(DFO)
structure(list(PO_ID = c("P1234", "P1234", "P1234", "P1234",
"P1234", "P1234", "P1234", "P2345", "P2345", "P2345", "P3456",
"P3456", NA, NA), SO_ID = c("S1", "S1", "S1", "S2", "S2", "S2",
"S3", "S3", "S4", "S5", "S7", NA, "S10", "S10"), F_Year = c(2012,
2012, 2012, 2013, 2013, 2013, 2013, 2011, 2011, 2012, 2014, 2014,
2015, 2015), Product_ID = c("385X", "385X", "385X", "450X", "450X",
"900X", "N9X", "3700", "3700", "3800", "A11U", "385X", "2700",
"3700"), Revenue = c(16.6666666666667, 16.6666666666667, 16.6666666666667,
35, 35, 35, 100, 100, -50, 20, 50, 20, 100, 10), Quantity = c(1,
1, 1, 10, 10, 20, 20, 20, -10, 20, 20, 5, 40, 20), Location1 = c("MA",
"NY", "WA", "NY", "WA", "NY", NA, "IL", "IL", NA, "MN", NA, "CA",
"CA"), Name = c("N1", "N1", "N1", "N1", "N1", "N1", NA, "N2",
"N2", NA, "N3", NA, "N4", "N4")), .Names = c("PO_ID", "SO_ID",
"F_Year", "Product_ID", "Revenue", "Quantity", "Location1", "Name"
), row.names = c(NA, 14L), class = "data.frame")
逻辑:
查找基于三列完成:PO_ID、SO_ID 和 F_Year。如果找到匹配项,则仅当数据框中的条目需要更新时才会覆盖。例如,对于 PO_ID = P1234、SO_ID = S1 且 F_Year = 2012 的行,其 Location1 和 Name 列中的条目不应被覆盖,因为它们的值已存在于查找表中。但是,对于 PO_ID = NA、SO_ID = S10 且 F_Year = 2015 这样的行,Location1 和 Name 列中缺失(NA)的条目需要使用查找表中的有效值进行更新,这两个值分别为 CA 和 N4。我尝试使用 data.table,但我的代码覆盖了所有条目,这是不正确的。
我读了“Compare and merge two dataframes”这个帖子,并尝试照着做,但代码覆盖了我不需要查找的现有条目。
这是我的代码:
data.table::setDT(DF_Generated)
data.table::setDT(DF_Lookup_2)
data.table::setkey(DF_Generated,PO_ID,SO_ID,F_Year)
data.table::setkey(DF_Lookup_2,PO_ID,SO_ID,F_Year)
DF_Generated[DF_Lookup_2,on=c("PO_ID","SO_ID","F_Year"),c("Location1","Name"):=list(i.Location1,i.Name)]
我有两个问题:
问题1)我正在使用 data.table,因为我的实际数据很大。所以,我正在寻找基于 data.table 的解决方案。我该如何修复我的 data.table 代码?
问题2)如果 dplyr 是推荐的方式,我也持开放态度。
但是,对于我的学习,如果你能帮我解决这两个问题,我真的很感激。我是初学者,还在学习这两个包。
答案 0 :(得分:2)
另一种做法是(按条件)连接到 DF_Lookup_2,然后再把结果赋值回 DF_Generated。之所以要反过来连接,是因为在 X[Y] 形式的连接中,结果的行数与 Y 相同;因此,DF_Lookup_2[DF_Generated] 这种形式的连接会得到我们所需的长度。然后,我们可以按原样将结果放回 DF_Generated。
# Fill Location1/Name from the lookup table for rows where at least one of the
# two columns is NA. The join is written as DF_Lookup_2[<subset>] so the result
# has one row per selected DF_Generated row and can be assigned back directly.
# Note: both columns are assigned for every selected row, including rows where
# only one of them is NA.
rows_to_fill <- DF_Generated[, is.na(Location1) | is.na(Name)]
lookup_values <- DF_Lookup_2[
  DF_Generated[rows_to_fill],
  .(Location1, Name),
  on = .(PO_ID, SO_ID, F_Year)
]
DF_Generated[rows_to_fill, c("Location1", "Name") := lookup_values]
# Check the result against the expected output.
identical(DF_Generated, setDT(DFO))
## [1] TRUE
答案 1 :(得分:1)
我们可以在按 'PO_ID'、'SO_ID' 和 'F_Year' 进行连接(on)时创建两个新列,然后使用 set 更新旧列:
# Step 1: update-join. Bring the lookup values into two temporary columns
# (Location1N, NameN) of DF_Generated, matching on PO_ID, SO_ID and F_Year.
setDT(DF_Generated)[setDT(DF_Lookup_2), c("Location1N", "NameN") := list(i.Location1,
    i.Name), on = .(PO_ID, SO_ID, F_Year)]
nm1 <- c("Location1", "Name")  # columns to fill
nm2 <- paste0(nm1, "N")        # matching temporary lookup columns
# Step 2: for each target column, overwrite only the rows that are currently NA
# with the value from the corresponding temporary column.
for (j in seq_along(nm1)) {
  # Compute the NA row index once and reuse it for both the target rows and the
  # replacement values (the original indexed the values with an undefined `df1`).
  na_rows <- which(is.na(DF_Generated[[nm1[j]]]))
  set(DF_Generated, i = na_rows, j = nm1[j],
      value = DF_Generated[[nm2[j]]][na_rows])
}
# Step 3: drop the temporary columns and print the result.
DF_Generated[, (nm2) := NULL][]
# Check the result against the expected output.
identical(setDT(DFO), DF_Generated)
#[1] TRUE