我需要创建一个名为 new_id 的新变量,使得具有相同 id 的 tasks 或具有相同 location 的 tasks 显示相同的值。在这个例子中:
Table 1
id Task location
a Task1 lat1
b Task2 lat2
b Task3 lat1
c Task4 lat3
c Task5 lat4
d Task6 lat5
e Task7 lat5
Table want
id Task Location New_id
a Task1 lat1 a
b Task2 lat2 a
b Task3 lat1 a
c Task4 lat3 c
c Task5 lat4 c
d Task6 lat5 d
e Task7 lat5 d
Task1 和 Task3 必须具有相同的 new_id,因为它们具有相同的 location。Task2 和 Task3 必须具有相同的 new_id,因为它们具有相同的 id。
我尝试在 data step 中使用 retain:首先按 location 排序,保留 first.location 对应的值;然后按 id 排序,保留 first.id 对应的值。
/* Pass 1: sort by location so that rows sharing a location are adjacent. */
proc sort data=table1;
by location;
/* Within each location group, carry the id of the group's first row
   forward (via the retained new_id_temp) and store it as new_id. */
data table1_1;
set table1;
by location;
retain new_id_temp;
if first.location then new_id_temp =id;
new_id=new_id_temp;
run;
/* Pass 2: sort by id so that rows sharing an id are adjacent. */
proc sort data=table1_1;
by id;
/* Within each id group, carry the value captured on the group's first row
   forward and store it as new_id.
   NOTE(review): id_temp is taken from id, not from the new_id computed in
   pass 1, so the location-based value is overwritten here. */
data table1_2;
set table1_1;
by id;
retain id_temp;
if first.id then id_temp=id;
new_id=id_temp;
run;
使用上面的代码,我仍然会得到两个不同的 new_id;而且如果数据集很大,proc sort 需要很多时间。
有人可以帮忙吗?
答案 0 :(得分:0)
您的问题在于,第二个 data step 没有改用 new_id 作为保留 ID 的来源,因此它使用的是 b 而不是 a。也就是说,应把第二个 data step 中的 `if first.id then id_temp=id;` 改为 `if first.id then id_temp=new_id;`。
我不确定这是否是解决该问题的高效方法,但它应该能给出您想要的结果。您可以在本站(或网上)搜索解决此问题的其他方法,因为这是一个已被充分研究、但比较复杂的问题。
答案 1 :(得分:0)
这应该能给出您想要的结果,但有一点需要注意:如果再添加一个 id 为 F、位置为 lat1 的 Task8,就需要一个更精确的算法,对数据进行两次或更多次遍历(pass)。不过,只要数据中共享相同 ID 和/或位置的记录彼此相邻排列,此解决方案就可以正常工作。
/* Sample input: one row per task with its id and location.
   The three variables are character with the default list-input length of 8. */
data tasks;
    length id Task location $ 8;
    input id Task location;
    datalines;
a Task1 lat1
b Task2 lat2
b Task3 lat1
c Task4 lat3
c Task5 lat4
d Task6 lat5
e Task7 lat5
;
使用 PROC FREQ 获取 id、task 和 location 的组合:
/* Cross-tabulate id by task by location and write the result to COMBINATIONS,
   dropping the COUNT and PERCENT statistics from the output data set. */
proc freq data=tasks;
    tables id * task * location
           / out=combinations (drop=count percent);
run;
/* Assign newID in a single pass over COMBINATIONS by comparing each row with
   the row read immediately before it:
     - the first row starts a group and takes its own id as newID;
     - a row that shares id or location with the previous row keeps the
       retained newID;
     - any other row starts a new group with its own id.
   Only the immediately preceding row is consulted. */
data newIDs;
    set combinations;

    length prev_id $ 1
           newID $ 1
           prev_location $ 4;
    retain newID prev_id prev_location;
    keep id task location newID;

    select;
        /* First scenario - first row. */
        when (_N_ = 1) do;
            put _N_= "First scenario - first row";
            newID = id;
        end;
        /* Second scenario - some redundancy between 2 rows: newID is carried over. */
        when (id = prev_id or prev_location = location)
            put _N_= "Second Scenario - some redundancy";
        /* Third scenario - no redundancy. */
        otherwise do;
            put _N_= "Third scenario - no redundancy";
            newID = id;
        end;
    end;

    /* Every row is written exactly once, then remembered for the next comparison. */
    output;
    prev_id = id;
    prev_location = location;
run;
将 Tasks 数据集与 newIDs 数据集合并:
proc sql;
/* Build TASKS_UPDATE: every row of TASKS (left join) with the newID found in
   NEWIDS for the same (id, task, location) combination, ordered by id. */
create table tasks_update as
select t.id
,i.newID
,t.Task
,t.location
from tasks as t
left join newIDs as i
on t.id = i.id
and t.task = i.task
and t.location = i.location
order by id;
quit;
id newID Task location
a a Task1 lat1
b a Task2 lat2
b a Task3 lat1
c c Task4 lat3
c c Task5 lat4
d d Task6 lat5
e d Task7 lat5
答案 2 :(得分:0)
/* To help you follow the algorithm, intermediate results are printed. */
%let print_diagnostics = 1; /* 0 : no diagnostics, 1 : diagnostics */

/* Read in the example, extended with extra data. */
options mprint;
title read input data;
data table1;
    input id $ Task $ location $;
    datalines;
a Task01 lat1
b Task02 lat2
b Task03 lat1
b Task04 lat0
c Task05 lat3
c Task06 lat4
d Task07 lat5
e Task08 lat5
f Task09 lat4
f Task10 lat6
g Task11 lat6
g Task12 lat7
h Task13 lat7
;
proc print data=table1;
run;
/* The solution needs several iterations, so it is wrapped in a macro. */

/*
 * re_identify(got, want)
 *
 *   got  : input data set; must contain the variables id and location.
 *   want : output data set; a copy of &got. with an added variable new_id.
 *
 * Starts with new_id = id and then repeatedly lowers new_id to the smallest
 * new_id already seen for the same id or the same location, until a pass
 * changes nothing.
 *
 * Reads the macro variable &print_diagnostics, which must be defined by the
 * caller: when true, the data sets hash_id and hash_loc are also written and
 * intermediate results are printed after every pass.
 */
%macro re_identify (got, want);
    /* Keep the loop-control variables out of the caller's symbol tables. */
    %local pass proceed;

    /* Initially, we assign id to new_id. */
    data &want.;
        set &got.;
        new_id = id;
    run;

    /* Proceed re-assigning ids until stabilised. */
    %let pass = 0;
    %let proceed = 1;
    %do %while (&proceed);
        /* To find the smallest new_id already used for an id or a location,
           hash tables are used. For details, see the paper "Data Step Hash
           Objects as Programming Tools".

           We will construct two hash tables:
             one with the smallest new_id for each id, and
             one with the smallest new_id for each location.
           To achieve this, the smallest new_id should come first, hence the
           sort by new_id before the tables are loaded. */
        %let pass = %eval(&pass + 1);
        title pass &pass;
        proc sort data=&want.;
            by new_id;
        run;

        data
            %if &print_diagnostics %then %do;
                hash_id(keep=id id_id)
                hash_loc(keep=location loc_id)
            %end;
            &want. (drop=rc loc_id id_id proceed);

            /* The hash tables only need to be loaded once per pass.
               Mind the declaration of the hash data variables!
               NOTE(review): loc_id and id_id are declared $ 1 while new_id
               inherits its length from id - presumably ids longer than one
               character would not fit; confirm before using on real data. */
            length loc_id id_id $ 1;
            if _N_ eq 1 then do;
                /* For each id and each location, the smallest new_id used up to now. */
                dcl hash h_id (dataset: "&want.(rename=(new_id=id_id))");
                h_id.defineKey('id');
                h_id.definedata('id_id','id');
                h_id.defineDone();
                dcl hash h_loc (dataset: "&want.(rename=(new_id=loc_id))");
                h_loc.defineKey('location');
                h_loc.definedata('loc_id','location');
                h_loc.defineDone();
                /* Unless we have to lower the new id for any id or location,
                   we can stop after this pass. */
                proceed = 0;
            end;
            retain proceed;

            /* Read in the data. */
            set &want. end=last;

            /* If there is a task with the same id or location with a smaller
               new_id, lower the new_id for this task. */
            rc = h_id.find() + h_loc.find();
            if rc then put 'WARNING: location not found' _all_;
            if id_id lt new_id then new_id = id_id;
            if loc_id lt new_id then new_id = loc_id;
            output &want.;

            /* If we lowered the new_id, adapt the hash table and proceed after
               this pass. Updating the hash tables with replace() is optional,
               but can greatly reduce the number of passes. */
            if id_id gt new_id then do;
                id_id = new_id;
                h_id.replace();
                proceed = 1;
            end;
            if loc_id gt new_id then do;
                loc_id = new_id;
                h_loc.replace();
                proceed = 1;
            end;

            /* Transfer the decision to proceed from a data step variable to a
               macro variable. SYMPUTX accepts the numeric value directly and
               strips blanks, so no padded value or conversion note results. */
            if last then call symputx ('proceed', proceed);

            %if &print_diagnostics %then %do;
                /* After the last row, dump both hash tables for inspection. */
                if last then do;
                    dcl hiter i_id ('h_id') ;
                    dcl hiter i_loc ('h_loc') ;
                    do rc = i_id.first () by 0 while ( rc = 0 ) ;
                        output hash_id;
                        rc = i_id.next () ;
                    end;
                    do rc = i_loc.first () by 0 while ( rc = 0 ) ;
                        output hash_loc;
                        rc = i_loc.next () ;
                    end;
                    put "NOTE: after pass &pass." proceed=;
                end;
            %end;
        run;

        %if &print_diagnostics %then %do;
            /* Print intermediate results. */
            title2 new id assigned to task; proc print data=&want.; run;
            title2 new id assigned to id; proc print data=hash_id; run;
            title2 new id assigned to location; proc print data=hash_loc; run;
        %end;
    %end;
%mend;
%re_identify(table1, table_want);

/* Finally, write out the report:
   sort in task order and print the final results. */
title final result;
proc sort data=table_want;
    by Task;
run;
proc print data=table_want;
run;
/* End of example. */