我有两张结构相似的表:
- 第一个表:id和col1,col2,col3 - 所有数字。
- 第二个表:id和col4,col5,col6 - 所有数字。
我想从第一个表中删除所有与第二个表中任意一行"相似"的行。这里的"相似"是指:col1-col3 中的任意一列的值等于 col4-col6 中任意一列的值。目前我用 9 个连续的 DATA 步来完成(第一步检查 col1 = col4,第二步检查 col1 = col5,……,第九步检查 col3 = col6),这可能不是最佳方案。
有什么办法可以改进吗?
答案 0 :(得分:2)
这是我的解决方案:
/* Approach 1: collect every value found in ds2 (col4-col6) into the macro
   variable CVARS, then delete each ds1 row in which any col: variable
   equals one of those values. */

/* Stack col4-col6 into the single column COMPVAR:
   one output row per (ds2 row, column) pair. */
data vec1;
  set ds2;
  array cvar{*} col4 col5 col6;
  do ijk = 1 to dim(cvar);
    compvar = cvar{ijk};
    output;
  end;
run;

/* Build a blank-separated list of the distinct values.
   The value is formatted explicitly with BEST32.: when a numeric column is
   selected INTO a macro variable directly, PROC SQL converts it with the
   default BEST8. format, which rounds values that need more than 8
   characters and would make the IN test below compare against the wrong
   numbers. */
proc sql noprint;
  select distinct strip(put(compvar, best32.))
    into :cvars separated by ' '
    from vec1;
quit;
/* Number of rows returned by the query above (kept for callers; it is not
   used by the step below). */
%let numcvar=&sqlobs;

/* Rewrite ds1 in place, dropping every row that has at least one col:
   variable whose value appears in the CVARS list. */
data ds1(drop=i);
  set ds1;
  array myvar{*} col:;
  do i = 1 to dim(myvar);
    if myvar{i} in (&cvars.) then delete;
  end;
run;
如果 CVARS 宏变量的长度超出了限制,可以改用下面的方法:
/* Approach 2: same idea as the macro-variable list, but the distinct ds2
   values are carried in a one-row data set (x1-xN) instead of a macro
   variable, so macro-variable length is not a constraint. */

/* Stack every col: variable of ds2 into the single column COMPVAR. */
data vec1;
  set ds2;
  array cvar{*} col:;
  do ijk = 1 to dim(cvar);
    compvar = cvar{ijk};
    output;
  end;
run;

/* Keep one row per distinct value. */
proc sort data=vec1 out=vec2(keep=compvar) nodupkey;
  by compvar;
run;

/* Turn the column of distinct values into a single row x1, x2, ..., xN. */
proc transpose data=vec2 out=flat prefix=x;
run;

/* Rewrite ds1, deleting each row in which any col: variable equals any
   x: value.  The input is ds1 (the original read "ds1b", a data set that is
   not created anywhere in this program). */
data ds1(keep=id col:);
  set ds1;
  /* Read the one-row lookup once; variables read by SET are retained, so
     x1-xN stay available for every later ds1 row. */
  if _n_ = 1 then set flat;
  array myvar{*} col:;
  array xvar{*} x:;
  do i = 1 to dim(myvar);
    do j = 1 to dim(xvar);
      if myvar{i} = xvar{j} then delete;
    end;
  end;
run;
PROC SORT 这一步可以省略,但保留它可以提高处理大数据集时的效率。
或者您可以动态生成格式:
/* Approach 3: turn the distinct ds2 values into a numeric format (REMOBS.)
   that maps each of them to the label "remove", then delete every ds1 row
   in which any col: variable formats to "remove". */

/* Stack col4-col6 into the single column COMPVAR. */
data vec1;
  set ds2;
  array cvar{*} col4 col5 col6;
  do ijk = 1 to dim(cvar);
    compvar = cvar{ijk};
    output;
  end;
run;

/* One row per distinct value. */
proc sort data=vec1 out=vec2 nodupkey;
  by compvar;
run;

/* Build the CNTLIN control data set for PROC FORMAT.
   START is written with BEST32. (and sized $32) instead of the default-width
   BEST. format, which is 12 characters wide and would round larger or more
   precise values before they become format ranges. */
data fmt1;
  set vec2;
  length start $32;
  fmtname = "remobs";
  type    = "N";                               /* numeric format */
  start   = compress(put(compvar, best32.));
  label   = "remove";
run;

proc format lib=work cntlin=fmt1;
run;

/* Rewrite ds1 in place, dropping rows where any col: value is in the format. */
data ds1(drop=i);
  set ds1;
  array myvar{*} col:;
  do i = 1 to dim(myvar);
    if put(myvar{i}, remobs.) = "remove" then delete;
  end;
run;
我怀疑最后一种方法比前两种解决方案更快。
另一种做法是使用哈希对象:
/* Approach 4: load the distinct ds2 values into a hash object and write to
   ds1_new only those ds1 rows for which none of col1-col3 is found in it. */

/* Stack col4-col6 into the single column COMPVAR. */
data vec1;
  set ds2;
  array cvar{*} col4 col5 col6;
  do ijk = 1 to dim(cvar);
    compvar = cvar{ijk};
    output;
  end;
run;

/* One row per distinct value. */
proc sort data=vec1 out=vec2 nodupkey;
  by compvar;
run;

data ds1_new(keep=id col1 col2 col3);
  /* Never executes; it only makes COMPVAR known to the compiler so the hash
     key has a host variable. */
  if 0 then set work.vec2;

  /* Create and load the hash table once.  Without the _n_ = 1 guard the
     DECLARE statement would run on every iteration and rebuild the table
     from work.vec2 for each ds1 row. */
  if _n_ = 1 then do;
    declare hash myhash(dataset: 'work.vec2');
    rc = myhash.defineKey('compvar');
    rc = myhash.defineDone();
  end;

  set ds1;
  array lookup{*} col1 col2 col3;
  do i = 1 to dim(lookup);
    /* CHECK only tests for the key (return code 0 = found) and, unlike FIND,
       does not copy anything back into data step variables. */
    if myhash.check(key: lookup{i}) = 0 then delete;
  end;
run;
答案 1 :(得分:1)
好的,这是我对这个问题的第二次尝试。我先对两个数据集做笛卡尔连接,使表 1 中的每一行都与表 2 中的每一行配对;然后用数组找出哪些配对中存在相同的值。
/* Sample master table: id plus three numeric columns col1-col3. */
data ds1;
  infile datalines;
  input id col1 col2 col3;
  datalines;
1 10 20 30
2 40 50 60
3 70 80 90
4 15 25 35
5 45 55 65
;
run;

/* Sample comparison table: id plus three numeric columns col4-col6. */
data ds2;
  infile datalines;
  input id col4 col5 col6;
  datalines;
10 100 200 300
12 60 50 600
13 700 800 70
16 15 20 300
;
run;
/* Approach 5: pair every ds1 row with every ds2 row, record the ds1 ids of
   pairs that share a value, and remove those ids from ds1 in place. */

/* Cartesian product of ds1 and ds2; the two id columns are exposed as
   id1 (from ds1) and id2 (from ds2). */
proc sql;
  create view all_cols as
    select ds1.id as id1, ds2.id as id2, *
    from ds1, ds2;
quit;

/* Keep each (ds1 row, ds2 row) pair in which any of col1-col3 equals any of
   col4-col6.  RETURN after OUTPUT stops scanning a pair at its first hit so
   a pair is written at most once. */
data match;
  set all_cols(keep=id1 id2 col:);
  array vars1{*} col1-col3;
  array vars2{*} col4-col6;
  do i = 1 to dim(vars1);
    do j = 1 to dim(vars2);
      if vars1{i} = vars2{j} then do;
        output;
        return;
      end;
    end;
  end;
  drop i j;
run;

/* A ds1 row can match several ds2 rows, so MATCH may hold the same id1 more
   than once.  NODUPKEY leaves one transaction per id1 so that the MODIFY
   step below never tries to remove the same master row twice. */
proc sort data=match nodupkey;
  by id1;
run;

/* Delete the matched ids from ds1 in place.  _IORC_ reports whether the
   transaction's id was located in the master; only then is the row removed.
   Otherwise the error flag is reset so an unmatched id does not flood the
   log or flag the step as failed. */
data ds1;
  modify ds1 match(keep=id1 rename=(id1=id));
  by id;
  if _iorc_ = %sysrc(_sok) then remove;
  else _error_ = 0;
run;