我想保留所有重复的ID,除非由于缺少值而没有新信息。例如,
/* Sample data: seven observations that share id 1; '.' marks a missing
   numeric value. The INPUT statement must end with a semicolon, otherwise
   DATALINES is read as one more variable name and the step fails. */
data test;
input id var1 var2 var3;
datalines;
1 2 3 4
1 4 . 4
1 6 5 4
1 . 3 .
1 2 4 4
1 6 . 4
1 . 8 4
;
run;
我希望结果是
1 2 3 4
1 4 . 4
1 6 5 4
1 2 4 4
1 . 8 4
第4行被删除,是因为第1行具有相同的ID和var2,而第4行其余的值均缺失,没有提供新信息。第6行被删除,是因为第3行具有相同的ID、var1和var3。我还需要一个健壮的解决方案,因为我希望该解决方案适用于数据集中任意数量的变量(id始终是唯一的键变量)。
有什么想法吗?我考虑过使用带nodupkey选项的proc sort,但是如果一行中存在多个缺失值,这种方法就不起作用了。
答案 0 :(得分:3)
您可以先提取含有一个或多个缺失值的行。第二步,为这些行生成正则表达式(RegExp),用来识别与之相似的完整行。
欢迎提出建议来改进这段代码。
/* Demo input: seven observations for id 1, read with list input.
   A lone '.' is a missing numeric value. */
data test;
  input id var1-var3;
  cards;
1 2 3 4
1 4 . 4
1 6 5 4
1 . 3 .
1 2 4 4
1 6 . 4
1 . 8 4
;
run;
/* Split TEST into two data sets:
     TEST2   - rows with no missing numeric value; RES holds the values
               joined with "_" (for example 1_2_3_4).
     MISSING - rows with at least one missing value; RES holds a Perl
               regular expression in which every missing value is replaced
               by a number placeholder (for example /^1_-?\d+_3_-?\d+\s*$/).
   The pattern is anchored with ^ and \s*$ so that it has to match the whole
   (blank-padded) RES of a complete row. Without the anchors 1_\d+_3_\d+
   would also match 11_2_3_45. */
data test2 missing;
  /* increase these lengths if you have wide values or many variables */
  length res $ 200 addedEl $ 10;
  set test;
  /* every numeric variable read from TEST, id included */
  array num _NUMERIC_;
  /* flag = 1 as soon as one value of the row is missing */
  flag = 0;
  do i = 1 to dim(num);
    if missing(num(i)) then do;
      flag = 1;
      /* placeholder for an integer, optionally negative. The 8. format
         below writes integers only; for decimal values widen the format
         and use -?\d+\.?\d* here. */
      addedEl = "-?\d+";
    end;
    else addedEl = compress(put(num(i), 8.));
    /* "_" delimits the values so that multi-digit numbers stay apart */
    res = catx("_", res, addedEl);
  end;
  if flag = 0 then output test2;
  else do;
    /* wrap as /^...\s*$/ : whole-string match, trailing blanks allowed */
    res = catt("/^", res, "\s*$/");
    output missing;
  end;
  drop i flag addedEl;
run;
/* Find the incomplete rows that duplicate a complete row: pair every row of
   MISSING (B) with every row of TEST2 (A) and keep B when its pattern in
   B.res matches A.res.
   Only B's columns are selected. A has the same column names, so A.* added
   nothing to the table except duplicate-column warnings in the log.
   PRXM (the compiled pattern id) is kept as a column because the final step
   drops it by name. */
proc sql noprint;
  create table matched as
    select B.*
          ,prxparse(B.res) as prxm
    from test2 as A
        ,missing as B
    where prxmatch(calculated prxm, A.res)
    order by B.res;
quit;
/* Sort MISSING by its pattern so it can be match-merged with MATCHED by res */
proc sort data=missing;
  by res;
run;

/* Anti-join: keep the incomplete rows whose pattern does not appear in
   MATCHED, i.e. the rows that no complete row makes redundant */
data miss_correctred;
  merge missing(in=in_missing) matched(in=in_matched);
  by res;
  if in_missing ne 1 or in_matched ne 0 then delete;
run;
/* Final result: the complete rows followed by the incomplete rows that
   survived the anti-join; the helper columns are removed */
data test_res;
  set test2 miss_correctred;
  drop prxm res;
run;
结果:
+----+------+------+------+
| id | var1 | var2 | var3 |
+----+------+------+------+
| 1 | 2 | 3 | 4 |
| 1 | 6 | 5 | 4 |
| 1 | 2 | 4 | 4 |
| 1 | 4 | . | 4 |
| 1 | . | 8 | 4 |
+----+------+------+------+
答案 1 :(得分:2)
下面是一种在单个数据步中使用哈希对象加双DOW循环的方法:
我认为它在最坏情况下的复杂度是O(n^4),但如果重复行的比例较高,实际表现应该会更好。
下面是一个示例实现(代码确实比较乱):
/* Describe the numeric, non-id columns of WORK.TEST in three macro variables:
     varlist   - quoted, comma-separated names (arguments for hash definekey/definedata)
     arraylist - blank-separated names (for ARRAY and KEEP statements)
     varcount  - number of such columns
   The id column is excluded case-insensitively because dictionary.columns
   stores NAME in the case used when the table was created. */
proc sql noprint;
  select
    quote(trim(name)),
    name
  into
    :varlist separated by ',',
    :arraylist separated by ' '
  from dictionary.columns
  where
    libname = 'WORK'
    and memname = 'TEST'
    and type = 'num'
    and upcase(name) ne 'ID'
  ;
quit;
/* Count the names directly instead of selecting count(name) next to
   non-aggregated columns, which forces a summary remerge */
%let varcount = %sysfunc(countw(&arraylist, %str( )));
/* Single data step that processes TEST one id group per step iteration.
   DOW loop #1 loads every row of the group into a hash (keyed by a running
   row number plus all variables) and outputs the rows without missing values.
   DOW loop #2 re-reads the same rows through a second SET statement and
   outputs a row with missing values only when at least one pair of its
   variables finds no match among the rows stored earlier in the group.
   Requires the macro variables arraylist, varcount and varlist.
   NOTE(review): complete rows are output in loop #1 and incomplete rows in
   loop #2, so within an id the output order differs from the input order.
   NOTE(review): the duplicate test is pairwise; with more than three
   variables each pair could be matched by a different stored row - confirm
   that this is the intended meaning of "no new information". */
data want;
/*Set up arrays*/
/* never executed; only makes the variables of TEST known at compile time */
if 0 then set test;
array vars[*] &arraylist;
/* scratch copy of the current row, because the hash iterator overwrites vars[] */
array temp[&varcount] _temporary_;
length sub_id 8;
keep id &arraylist;
/*Set up hash + iterator*/
if _n_ = 1 then do;
declare hash h(ordered:'a');
rc = h.definekey('sub_id', &varlist);
rc = h.definedata('sub_id', &varlist);
rc = h.definedone();
declare hiter hi('h');
end;
/*DOW #1 - load hash and output definite non-duplicates*/
/* _n_ is reused as the row counter within the current id group */
do _n_ = 1 by 1 until(last.id);
set test;
by id;
/*We need a way to keep track of rows within each id so that we don't count rows as duplicates when they match themselves in DOW #2*/
sub_id = _n_;
rc = h.add();
/* rows without any missing value are written immediately */
if rc = 0 and nmiss(of vars[*]) = 0 then output;
end;
/*DOW #2 - check for any previously unseen pairs of values*/
/* upper bound is the row count reached by DOW #1; this second SET statement reads the same rows again */
do _n_ = 1 to _n_;
set test;
/*Make a copy of the current row to retrieve after looping through the hash iterator*/
do i = 1 to dim(vars);
temp[i] = vars[i];
end;
/* only rows with at least one missing value are examined here */
if nmiss(of vars[*]) > 0 then do;
/* assume the row is a duplicate until some pair has no match */
dup_flag = 1;
/*Work through successive pairs of values*/
do i = 1 to dim(vars) while(dup_flag = 1);
do j = 1 to i - 1 while(dup_flag = 1);
__v_i = temp[i];
__v_j = temp[j];
match_flag = 0;
/*For each pair, loop through the iterator until we find a 'match'*/
/* each iterator call loads a stored row into sub_id and vars[] */
rc = hi.first();
/* sub_id < _n_ stops at the current row, so a row is never compared with itself;
   presumably this relies on ordered:'a' returning items by ascending sub_id - verify */
do while(rc = 0 and match_flag = 0 and sub_id < _n_);
/* a missing value in the current row is treated as matching any stored value */
if (missing(__v_i) or __v_i = vars[i])
and (missing(__v_j) or __v_j = vars[j])
then match_flag = 1;
rc = hi.next();
end;
/*If we didn't find a match, we have a new combination and the row is not a duplicate*/
if match_flag = 0 then dup_flag = 0;
end;
end;
if dup_flag = 0 then do;
/* restore the row's own values before writing it */
do i = 1 to dim(vars);
vars[i] = temp[i];
end;
output;
end;
end;
end;
/* empty the hash before the next step iteration (the next id group) */
rc = h.clear();
run;