删除SAS中的类似行

时间:2012-03-20 09:38:19

标签: sas duplicate-removal 4gl

我有两张结构相似的表:
- 第一个表:id和col1,col2,col3 - 均为数值型。
- 第二个表:id和col4,col5,col6 - 均为数值型。

我想从第一个表中删除所有与第二个表中任意一行相似的行。当col1-col3这组列中的任意一列等于col4-col6这组列中的任意一列时,我认为这两行相似。现在我用9个连续的数据步来实现(第一步检查col1 = col4,第二步检查col1 = col5,……,第九步检查col3 = col6),这可能不是最佳解决方案。

任何想法如何改善这个?

2 个答案:

答案 0 :(得分:2)

这是我的解决方案:

/* Approach 1: collect every distinct value of ds2.col4-col6 into a macro
   variable and delete each ds1 row whose col: variables contain any of them. */

/* Stack col4-col6 of ds2 into a single column (compvar), one row per value. */
data vec1;
  set ds2;
  array cvar{*} col4 col5 col6;
  do ijk=1 to dim(cvar);
    compvar=cvar(ijk);
    output;
  end;
run;

proc sql noprint;
  /* INTO converts numeric values to text with the default BEST8. format,
     which rounds anything needing more than 8 significant digits; an explicit
     BEST32. keeps the values exact. SEPARATED BY trims the padding blanks. */
  select distinct compvar format=best32. into :cvars separated by ' '
  from vec1;
quit;
/* Number of distinct values; not used below, kept for callers/diagnostics. */
%let numcvar=&sqlobs;

/* Rewrite ds1 in place, dropping rows where any col: value is in the list. */
data ds1(drop=i);
  set ds1;
  /* Explicit array instead of the legacy implicit form (array x(i) / do over). */
  array myvar{*} col:;
  do i=1 to dim(myvar);
    if myvar{i} in (&cvars.) then delete;
  end;
run;

如果宏变量CVARS的长度超出了限制,可以改用下面的方法:

/* Approach 2: avoid the macro variable by transposing the distinct ds2 values
   into one wide row (x1..xN) that is compared against every ds1 row. */

/* Stack the col: variables of ds2 into a single column (compvar). */
data vec1;
  set ds2;
  array cvar{*} col:;
  do ijk=1 to dim(cvar);
    compvar=cvar(ijk);
    output;
  end;
run;

/* Distinct values only; fewer x variables means fewer comparisons later. */
proc sort data=vec1 out=vec2(keep=compvar) nodupkey;
  by compvar;
run;

/* One observation holding x1..xN, one variable per distinct value. */
proc transpose data=vec2 out=flat prefix=x;
  var compvar;
run;

data ds1(keep=id col:);
  /* The original read "ds1b", a dataset that is never created; the table
     being filtered is ds1, as in the other variants of this solution. */
  set ds1;
  /* Read the single wide row once; variables read with SET are retained,
     so x1..xN stay available on every iteration. */
  if _n_=1 then set flat;
  array myvar{*} col:;
  array xvar{*} x:;
  do i=1 to dim(myvar);
    do j=1 to dim(xvar);
      if myvar{i}=xvar{j} then delete;
    end;
  end;
run;

PROC SORT这一步可以省略,但保留它可以提高处理大数据集时的效率。

或者您可以动态生成格式:

/* Approach 3: turn the distinct ds2 values into a numeric format that maps
   each of them to the label "remove", then test ds1 values with PUT(). */

/* Stack col4-col6 of ds2 into a single column (compvar). */
data vec1;
  set ds2;
  array cvar{*} col4 col5 col6;
  do ijk=1 to dim(cvar);
    compvar=cvar(ijk);
    output;
  end;
run;

proc sort data=vec1 out=vec2 nodupkey;
  by compvar;
run;

/* CNTLIN control dataset: one range per distinct value. Only the columns
   PROC FORMAT needs are kept. */
data fmt1(keep=fmtname start label);
  set vec2;
  length start $32;
  fmtname="remobs";
  /* BEST. defaults to width 12 and would round longer values, so the
     generated range would never match them; BEST32. avoids that. */
  start=compress(put(compvar,best32.));
  label="remove";
run;

proc format lib=work cntlin=fmt1;
run;

/* Rewrite ds1 in place, dropping rows where any col: value maps to "remove". */
data ds1(drop=i);
  set ds1;
  /* Explicit array instead of the legacy implicit form (array x(i) / do over). */
  array myvar{*} col:;
  do i=1 to dim(myvar);
    if put(myvar{i},remobs.)="remove" then delete;
  end;
run;

我估计最后一种方法会比前两种解决方案更快。

更新

使用哈希对象

/* Approach 4: load the distinct ds2 values into a hash object and drop every
   ds1 row that has any of col1-col3 present in it. Result goes to ds1_new. */

/* Stack col4-col6 of ds2 into a single column (compvar). */
data vec1;
  set ds2;
  array cvar{*} col4 col5 col6;
  do ijk=1 to dim(cvar);
    compvar=cvar(ijk);
    output;
  end;
run;

proc sort data=vec1 out=vec2 nodupkey;
  by compvar;
run;

data ds1_new(keep=id col1 col2 col3);
  /* Never executes (_n_ is never 0); it only defines compvar in the PDV at
     compile time so the hash key has a host variable. */
  if _n_ = 0 then set work.vec2;
  /* Build and load the hash once. Without this guard the DECLARE statement
     runs on every iteration and reloads work.vec2 for each ds1 row. */
  if _n_ = 1 then do;
    declare hash myhash(dataset:'work.vec2');
    rc=myhash.defineKey('compvar');
    rc=myhash.defineDone();
  end;
  set ds1;
  array lookup{*} col1 col2 col3;
  do i=1 to dim(lookup);
    /* CHECK only tests key existence (0 = found); no data retrieval needed. */
    if myhash.check(key: lookup(i))=0 then delete;
  end;
run;

答案 1 :(得分:1)

好的,这是我第二次尝试回答这个问题。我对两个数据集做了笛卡尔连接,使表1中的每一行都与表2中的每一行配对。然后就可以用数组找出哪些行对含有相同的值。

/* Sample master table: id plus three numeric columns (col1-col3). */
data ds1;
  input id col1 col2 col3;
  datalines;
1 10 20 30
2 40 50 60
3 70 80 90
4 15 25 35
5 45 55 65
;
run;

/* Sample comparison table: id plus three numeric columns (col4-col6). */
data ds2;
  input id col4 col5 col6;
  datalines;
10 100 200 300
12 60 50 600
13 700 800 70
16 15 20 300
;
run;

/* View pairing every ds1 row with every ds2 row (Cartesian product).
   The two id columns are exposed as id1 (from ds1) and id2 (from ds2);
   the trailing * also brings in all columns of both tables. */
proc sql;
  create view all_cols as
    select ds1.id as id1,
           ds2.id as id2,
           *
    from ds1 cross join ds2;
quit;

/* Keep each (ds1 row, ds2 row) pair in which at least one of col1-col3
   equals at least one of col4-col6. Each qualifying pair is written once. */
data match(drop=_r _c _hit);
  set all_cols(keep=id1 id2 col:);
  array ds1_vals{*} col1-col3;
  array ds2_vals{*} col4-col6;

  /* Scan the 3x3 combinations, stopping as soon as one equality is found. */
  _hit = 0;
  do _r = 1 to dim(ds1_vals) while (not _hit);
    do _c = 1 to dim(ds2_vals) while (not _hit);
      _hit = (ds1_vals{_r} = ds2_vals{_c});
    end;
  end;

  /* Subsetting IF: only pairs with a match reach the implicit OUTPUT. */
  if _hit;
run;

/* Reduce the matched pairs to one row per ds1 id. A ds1 row can match several
   ds2 rows, and the MODIFY step below should be asked to remove each master
   observation only once. Writing to a separate dataset leaves MATCH intact. */
proc sort data=match(keep=id1) out=match_ids nodupkey;
  by id1;
run;

/* Delete the matched rows from ds1 in place: MODIFY ... BY locates the ds1
   observation for each id in the transaction data and REMOVE deletes it. */
data ds1;
  modify ds1 match_ids (in=b rename=(id1=id));
  by id;
  if b then remove;
run;