我正在努力将一些SAS程序迁移到R中,并且我很难找到实现这个看似简单逻辑的最佳方法。
proc sort data=df1; by cridsessid mode refresh_key; run;
proc sort data=df2; by cridsessid mode refresh_key; run;
data df3;
merge df1(in=xx) df2(in=yy);
by cridsessid mode refresh_key;
if xx and yy then do;
cridsessid=catx(':',cridsessid,refresh_key2);
end;
run;;
数据集很难,因为在数百万行中可能只有几千行实际匹配xx
和yy
,但基本输出如下所示:
DF1
transId_app mode sm_bdt sm_edt
1 c3bca1af-ed0d-4403-9552-29758055f7a3 None 21MAR2016:07:07:56.611 21MAR2016:07:07:56.627
2 68f85148-6b75-49dc-90f9-5bb66b6a750b None 21MAR2016:07:07:56.940 21MAR2016:07:07:56.940
3 68f85148-6b75-49dc-90f9-5bb66b6a750b None 21MAR2016:07:07:56.940 21MAR2016:07:07:56.955
4 68f85148-6b75-49dc-90f9-5bb66b6a750b None 21MAR2016:07:07:56.940 21MAR2016:07:07:56.940
5 68f85148-6b75-49dc-90f9-5bb66b6a750b None 21MAR2016:07:07:56.924 21MAR2016:07:07:56.940
6 7a0c53a3-00b4-4b81-8238-24a738e5f4ed None 21MAR2016:07:08:33.003 21MAR2016:07:08:33.003
mode_ cridsessid
1 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2
2 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2
3 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2
4 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2
5 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2
6 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2
refresh_key
1 7.125
2 7.125
3 7.125
4 7.125
5 7.125
6 7.142
dput()
structure(list(transId_app = c("c3bca1af-ed0d-4403-9552-29758055f7a3",
"68f85148-6b75-49dc-90f9-5bb66b6a750b", "68f85148-6b75-49dc-90f9-5bb66b6a750b",
"68f85148-6b75-49dc-90f9-5bb66b6a750b", "68f85148-6b75-49dc-90f9-5bb66b6a750b",
"7a0c53a3-00b4-4b81-8238-24a738e5f4ed", "7a0c53a3-00b4-4b81-8238-24a738e5f4ed",
"7a0c53a3-00b4-4b81-8238-24a738e5f4ed", "7a0c53a3-00b4-4b81-8238-24a738e5f4ed"
), mode = c("None", "None", "None", "None", "None", "None", "None",
"None", "None"), sm_bdt = c("21MAR2016:07:07:56.611", "21MAR2016:07:07:56.940",
"21MAR2016:07:07:56.940", "21MAR2016:07:07:56.940", "21MAR2016:07:07:56.924",
"21MAR2016:07:08:33.003", "21MAR2016:07:08:33.003", "21MAR2016:07:08:32.988",
"21MAR2016:07:08:32.957"), sm_edt = c("21MAR2016:07:07:56.627",
"21MAR2016:07:07:56.940", "21MAR2016:07:07:56.955", "21MAR2016:07:07:56.940",
"21MAR2016:07:07:56.940", "21MAR2016:07:08:33.003", "21MAR2016:07:08:33.003",
"21MAR2016:07:08:33.003", "21MAR2016:07:08:33.003"), mode_ = c("Pass1",
"Pass1", "Pass1", "Pass1", "Pass1", "Pass1", "Pass1", "Pass1",
"Pass1"), cridsessid = c("0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2"
), refresh_key = c(7.125, 7.125, 7.125, 7.125, 7.125, 7.142,
7.142, 7.142, 7.142)), .Names = c("transId_app", "mode", "sm_bdt",
"sm_edt", "mode_", "cridsessid", "refresh_key"), class = "data.frame", row.names = c(NA,
9L))
DF2
mode
1 None
2 None
3 LazyLoadUncached
4 None
5 LazyLoadUncached
6 None
cridsessid refresh_key_
1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2 7.142+0
2 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2 7.142+0
3 00e8a4c5-904d-46a4-a7b1-e4e6383bdd10:bf97d617-fbe8-4c4d-ab0e-c00d48bce120 0.317+0
4 00e8a4c5-904d-46a4-a7b1-e4e6383bdd10:bf97d617-fbe8-4c4d-ab0e-c00d48bce120 0.317+0
5 02063ca4-ccf3-4326-b87d-fe5ab13d2d7f:07783670-5ace-47bc-a707-db5b8064e241 5.658+17
6 02063ca4-ccf3-4326-b87d-fe5ab13d2d7f:07783670-5ace-47bc-a707-db5b8064e241 5.658+17
refresh_key refresh_key2
1 7.142 7.142
2 7.142 7.142
3 0.317 0.317
4 0.317 0.317
5 5.658 5.658
6 5.658 5.658
dput()
structure(list(mode = c("None", "None", "LazyLoadUncached", "None",
"LazyLoadUncached", "None"), cridsessid = c("0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2",
"00e8a4c5-904d-46a4-a7b1-e4e6383bdd10:bf97d617-fbe8-4c4d-ab0e-c00d48bce120",
"00e8a4c5-904d-46a4-a7b1-e4e6383bdd10:bf97d617-fbe8-4c4d-ab0e-c00d48bce120",
"02063ca4-ccf3-4326-b87d-fe5ab13d2d7f:07783670-5ace-47bc-a707-db5b8064e241",
"02063ca4-ccf3-4326-b87d-fe5ab13d2d7f:07783670-5ace-47bc-a707-db5b8064e241"
), refresh_key_ = c("7.142+0", "7.142+0", "0.317+0", "0.317+0",
"5.658+17", "5.658+17"), refresh_key = c(7.142, 7.142, 0.317,
0.317, 5.658, 5.658), refresh_key2 = c(7.142, 7.142, 0.317, 0.317,
5.658, 5.658)), .Names = c("mode", "cridsessid", "refresh_key_",
"refresh_key", "refresh_key2"), row.names = c(NA, 6L), class = "data.frame")
DF3
transId_app mode sm_bdt sm_edt
1 7a0c53a3-00b4-4b81-8238-24a738e5f4ed None 21MAR2016:07:08:33.003 21MAR2016:07:08:33.003
2 7a0c53a3-00b4-4b81-8238-24a738e5f4ed None 21MAR2016:07:08:33.003 21MAR2016:07:08:33.003
3 7a0c53a3-00b4-4b81-8238-24a738e5f4ed None 21MAR2016:07:08:32.988 21MAR2016:07:08:33.003
4 7a0c53a3-00b4-4b81-8238-24a738e5f4ed None 21MAR2016:07:08:32.957 21MAR2016:07:08:33.003
mode_ cridsessid
1 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2:7.142
2 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2:7.142
3 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2:7.142
4 Pass1 0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2:7.142
refresh_key refresh_key_ refresh_key2
1 7.142 7.142+0 7.142
2 7.142 7.142+0 7.142
3 7.142 7.142+0 7.142
4 7.142 7.142+0 7.142
dput()
structure(list(transId_app = c("7a0c53a3-00b4-4b81-8238-24a738e5f4ed",
"7a0c53a3-00b4-4b81-8238-24a738e5f4ed", "7a0c53a3-00b4-4b81-8238-24a738e5f4ed",
"7a0c53a3-00b4-4b81-8238-24a738e5f4ed"), mode = c("None", "None",
"None", "None"), sm_bdt = c("21MAR2016:07:08:33.003", "21MAR2016:07:08:33.003",
"21MAR2016:07:08:32.988", "21MAR2016:07:08:32.957"), sm_edt = c("21MAR2016:07:08:33.003",
"21MAR2016:07:08:33.003", "21MAR2016:07:08:33.003", "21MAR2016:07:08:33.003"
), mode_ = c("Pass1", "Pass1", "Pass1", "Pass1"), cridsessid = c("0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2:7.142",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2:7.142",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2:7.142",
"0043dfb7-9a98-4b7c-9c04-5afc05580843:96530555-3568-468c-9bb4-ddd77278f1b2:7.142"
), refresh_key = c(7.142, 7.142, 7.142, 7.142), refresh_key_ = c("7.142+0",
"7.142+0", "7.142+0", "7.142+0"), refresh_key2 = c(7.142, 7.142,
7.142, 7.142)), .Names = c("transId_app", "mode", "sm_bdt", "sm_edt",
"mode_", "cridsessid", "refresh_key", "refresh_key_", "refresh_key2"
), row.names = c(NA, 4L), class = "data.frame")
我知道dplyr中的semi_join()
,并认为它可以有效地替换if xx and yy
,但semi_join()
会返回一个data.frame,我不知道怎么会得到一个逻辑矢量,我可以用来做这样的事情:
df <- full_join(df1, df2, by="cridsessid", "mode", "refresh_key")
x <- {logical vector}
df[x, "cridsessid"] <- df[x, paste("cridsessid", "mode", sep=":")]
答案 0 :(得分:0)
在没有一小部分输入集和输出数据集的情况下,SAS会返回什么,这有点不清楚。 (SAS合并会返回什么样的连接?)我会想到xx和yy对于内连接都是TRUE。然后,您就可以完成对粘贴的var1
,“:”,var1
值var2
的分配。我没有使用inner_join
,因为这是基函数merge
的默认操作:
df3 <- merge(df1, df2, by="var1", "var2", "var3")
df3["var1"] <- with(df3, paste0(var1, var4, sep=":"))