Question

我需要创建一个名为new_id的新变量，该变量为同一id tasks或同一location tasks显示相同的值。在这个例子中：

Table 1
id Task location
a Task1 lat1
b Task2 lat2 
b Task3 lat1
c Task4 lat3
c Task5 lat4
d Task6 lat5
e Task7 lat5



Table want
id Task Location New_id
a Task1 lat1     a
b Task2 lat2     a
b Task3 lat1     a 
c Task4 lat3     c
c Task5 lat4     c
d Task6 lat5     d
e Task7 lat5     d

Task1和Task3必须具有相同的new_id，因为它们具有相同的location。
Task2和Task3必须具有相同的new_id，因为它们具有相同的id。

我尝试使用retain data step。首先我按位置排序，保留first.variable，然后排序id，保留first.variable。

/* Pass 1 of the asker's attempt: give every row the id of the first row
   that shares its location. */
/* Sorts table1 in place (no OUT= option) so rows with the same location
   are adjacent, as BY-group processing requires. */
proc sort data=table1;
    by location;
/* There is no RUN statement above; the DATA statement below ends the
   PROC SORT step. */
data table1_1;
    set table1;
    by location;

    /* new_id_temp is retained across rows and is refreshed only on the
       first row of each location group, so every row in the group
       receives that first row's id as new_id. */
    retain new_id_temp;
    if first.location then new_id_temp =id; 
    new_id=new_id_temp;
run;

/* Pass 2 of the asker's attempt: regroup the pass-1 output by id. */
/* Sorts table1_1 in place (no OUT= option) so rows with the same id are
   adjacent. */
proc sort data=table1_1;
    by id;
/* There is no RUN statement above; the DATA statement below ends the
   PROC SORT step. */
data table1_2;
    set table1_1;
    by id;

    /* id_temp is retained and refreshed on the first row of each id group. */
    /* NOTE(review): the value is taken from id, not from the new_id that
       pass 1 computed, and new_id is then overwritten with it, so the
       location-based grouping from pass 1 is not carried forward. */
    retain id_temp;
    if first.id then id_temp=id; 
    new_id=id_temp;
run;

使用上面的代码，我仍然会得到两个不同的new_id；而且如果数据集很大，proc sort需要很多时间。

有人可以帮忙吗？

Answer 1

您的问题是，您没有更新第二个data step以使用new_id作为保留ID的来源，因此它使用b而不是a。也就是说，第二个data step中的 `if first.id then id_temp=id;` 应改为 `if first.id then id_temp=new_id;`。

我不确定这是否是解决问题的有效方法，但它应该能为您提供所需的结果。您可能希望在网站（或网络）周围搜索解决此问题的其他方法，因为它是一个很好理解但很复杂的问题。

Answer 2

这应该会给你想要的结果，但有一点需要注意：如果再添加一个id为F且位置为lat1的Task8，你就需要一个更精确的、需要两遍或更多遍扫描（pass）的算法。不过，只要共享相同ID和/或位置的记录在数据中彼此相邻，此解决方案就可以正常工作。

生成样本数据集

/* Sample data: one row per task, carrying the task's id and location. */
data tasks;
  /* Declare the three character columns up front with length 8, the
     length that list input with the $ modifier assigns by default. */
  length id Task location $ 8;
  infile datalines;
  input id Task location;
  datalines;
a Task1 lat1
b Task2 lat2 
b Task3 lat1
c Task4 lat3
c Task5 lat4
d Task6 lat5
e Task7 lat5
;
run;

使用`PROC FREQ`

生成所有可能的组合

/* Write every id * task * location combination that occurs in TASKS to the
   data set COMBINATIONS. The PERCENT and COUNT columns that PROC FREQ adds
   are dropped because only the key columns are used afterwards. */
/* NOPRINT: only the OUT= data set is needed; without it the procedure also
   prints the full three-way crosstabulation, which is expensive for large
   inputs. */
proc freq data=tasks noprint;
  table id * task * location / out=combinations (drop=percent count);
run;

根据您的标准计算新ID

/*
 * Assign newID to every row of COMBINATIONS in a single pass.
 *
 * Each row is compared only with the row immediately before it: if it has
 * the same id or the same location, it inherits that row's newID; otherwise
 * it starts a new group and newID becomes its own id. Rows that belong
 * together but are not adjacent in COMBINATIONS are therefore not linked.
 *
 * Output: NEWIDS with the columns id, task, location, newID.
 * A line naming the scenario that applied is written to the log per row.
 */
data newIDs;
  set combinations;

  /* Define the helper variables with the same type and length as the
     columns they copy. This SET never executes (IF 0); it only declares
     the variables at compile time. Fixed lengths such as $1 / $4 would
     truncate longer ids or locations and make the comparison with the
     previous row fail. */
  if 0 then set combinations (keep=id       rename=(id=newID))
                combinations (keep=id       rename=(id=prev_id))
                combinations (keep=location rename=(location=prev_location));
  retain newID prev_id prev_location;

  /* First scenario - first row: there is no previous row, start a group. */
  if _N_ = 1 then do;
    put _N_= "First scenario - first row";
    newID = id;
  end;

  /* Second scenario - some redundancy between 2 rows: keep the retained
     newID of the previous row. */
  else if id = prev_id or prev_location = location then
    put _N_= "Second Scenario - some redundancy";

  /* Third scenario - no redundancy: start a new group. */
  else do;
    put _N_= "Third scenario - no redundancy";
    newID = id;
  end;

  /* Common tail of all three scenarios: write the row, then remember its
     id and location for the comparison on the next row. */
  output;
  prev_id = id;
  prev_location = location;

  keep id task location newID;

run;

将`Tasks`数据集合并到`newIDs`数据集

/* Attach newID to every row of TASKS by matching on all three key columns.
   The left join keeps every TASKS row even when NEWIDS has no match.
   The result is ordered by id. */
proc sql;
  create table tasks_update as
  select src.id,
         ids.newID,
         src.Task,
         src.location
  from tasks as src
  left join newIDs as ids
    on  src.id       = ids.id
    and src.task     = ids.task
    and src.location = ids.location
  order by id;
quit;

结果

id newID Task  location 
a  a     Task1 lat1 
b  a     Task2 lat2 
b  a     Task3 lat1 
c  c     Task4 lat3 
c  c     Task5 lat4 
d  d     Task6 lat5 
e  d     Task7 lat5

Answer 3

/* 为了帮助您理解算法，我打印了中间结果 */

/* Diagnostics switch read by %re_identify:
   0 = no diagnostics, 1 = print intermediate results. */
%let print_diagnostics = 1;

/* Read in the example, extended with extra data. */

/* MPRINT writes the SAS statements generated by macro execution to the log. */
options mprint;

title read input data;
data table1;
  input id $ Task $ location $;
  datalines;
a Task01 lat1
b Task02 lat2
b Task03 lat1
b Task04 lat0
c Task05 lat3
c Task06 lat4
d Task07 lat5
e Task08 lat5
f Task09 lat4
f Task10 lat6
g Task11 lat6
g Task12 lat7
h Task13 lat7
;
run;

/* Print the input so it can be compared with the final result. */
proc print data=table1;
run;

/* 解决方案需要一些迭代，所以我们需要一个宏 */

/*
 * %re_identify(got, want)
 *
 * Copies data set &got to &want and adds new_id such that tasks sharing an
 * id or a location end up with the same new_id. new_id starts as the
 * task's own id and is lowered pass by pass to the smallest new_id seen for
 * the same id or the same location, until a pass changes nothing.
 *
 *   got   input data set; its variables id and location are used.
 *   want  output data set; rewritten on every pass.
 *
 * Reads the global macro variable print_diagnostics: when it is non-zero,
 * each pass also writes the data sets hash_id and hash_loc and prints the
 * intermediate results.
 * Side effects: sets TITLE (and TITLE2 when diagnostics are on).
 */
%macro re_identify (got, want);
    /* Keep the loop control variables out of the caller's symbol table. */
    %local pass proceed;

    /* Initially, we assign id to new_id. */
    data &want.;
        set &got.;
        new_id = id;
    run;

    /* Proceed re-assigning ids until stabilised. */
    %let pass = 0;
    %let proceed = 1;
    %do %while (&proceed);

        /* To find the smallest new_id already used for an id or a location,
           hash tables are used. For details, see the paper "Data Step Hash
           Objects as Programming Tools". */

        /* We construct two hash tables:
             one with the smallest new_id for each id and
             one with the smallest new_id for each location.
           To achieve this, the smallest new_id should come first,
           hence the sort by new_id. */
        %let pass = %eval(&pass + 1);

        /* Assume this is the last pass. The DATA step below overrides the
           value on its last observation. For an empty input the step never
           reaches that statement, and without this reset the loop would
           never terminate. */
        %let proceed = 0;

        title pass &pass;
        proc sort data=&want.;
            by new_id;
        run;

        data
            %if &print_diagnostics %then %do;
                 hash_id(keep=id id_id)
                 hash_loc(keep=location loc_id)
             %end;
             &want. (drop=rc loc_id id_id proceed);

            /* The hash tables only have to be loaded once, on the first
               iteration. The data variables they fill (id_id, loc_id) must
               be declared in the DATA step before the tables are defined. */

            /* Declare id_id and loc_id with the same type and length as
               new_id. This SET never executes (IF 0); it only defines the
               variables at compile time. A fixed $1 would truncate ids
               longer than one character. */
            if 0 then set &want. (keep=new_id rename=(new_id=id_id))
                          &want. (keep=new_id rename=(new_id=loc_id));

            /* Create hash tables holding, for each id and each location,
               the smallest new_id used up to now. */
            if _N_ eq 1 then do;
                dcl hash h_id (dataset: "&want.(rename=(new_id=id_id))");
                h_id.defineKey('id');
                h_id.definedata('id_id','id');
                h_id.defineDone();

                dcl hash h_loc (dataset: "&want.(rename=(new_id=loc_id))");
                h_loc.defineKey('location');
                h_loc.definedata('loc_id','location');
                h_loc.defineDone();

                /* Unless we have to lower the new id for any id or
                   location, we can stop after this pass. */
                proceed = 0;
            end;
            retain proceed;

            /* Read in the data. */
            set &want. end=last;

            /* Start every row with blank lookup results, so a failed
               lookup cannot reuse the previous row's values. */
            call missing(id_id, loc_id);

            /* If there is a task with the same id or location with a
               smaller new_id, lower the new_id for this task. */
            rc = h_id.find() + h_loc.find();
            if rc then put 'WARNING: location not found' _all_;
            if id_id lt new_id then new_id = id_id;
            if loc_id lt new_id then new_id = loc_id;
            output &want.;

            /* If we lowered the new_id, adapt the hash table and proceed
               after this pass. Adapting the hash tables with REPLACE is
               optional, but it can greatly reduce the number of passes. */
            if id_id gt new_id then do;
                id_id = new_id;
                h_id.replace();
                proceed = 1;
            end;
            if loc_id gt new_id then do;
                loc_id = new_id;
                h_loc.replace();
                proceed = 1;
            end;

            /* Transfer the decision to proceed from the DATA step variable
               to the macro variable. SYMPUTX accepts the numeric value
               directly and strips blanks. */
            if last then call symputx ('proceed', proceed);

            %if &print_diagnostics %then %do;
                /* On the last row, dump both hash tables to data sets so
                   they can be printed after the step. */
                if last then do;
                    dcl hiter i_id ('h_id') ; 
                    dcl hiter i_loc ('h_loc') ; 
                    do rc = i_id.first () by 0 while ( rc = 0 ) ;
                        output hash_id;
                        rc = i_id.next () ;
                    end;
                    do rc = i_loc.first () by 0 while ( rc = 0 ) ;
                        output hash_loc;
                        rc = i_loc.next () ;
                    end; 

                    put "NOTE: after pass &pass." proceed=;
                end;
            %end;
        run;

        %if &print_diagnostics %then %do;
            /* Print intermediate results. */
            title2 new id assigned to task;     proc print data=&want.;   run;
            title2 new id assigned to id;       proc print data=hash_id;  run;
            title2 new id assigned to location; proc print data=hash_loc; run;
        %end;
    %end;
%mend;

/* Run the re-identification on the sample data. */
%re_identify(table1, table_want);

/* Finally, write the report. */

/* Sort in task order and print the final results. */
title final result;
proc sort data=table_want;
    by Task;
run;
proc print data=table_want;
run;

/ * * /

有条件地保留

3 个答案:

生成样本数据集

使用`PROC FREQ`

根据您的标准计算新ID

将`Tasks`数据集合并到`newIDs`数据集

结果

有条件地保留

3 个答案:

生成样本数据集

使用PROC FREQ

根据您的标准计算新ID

将Tasks数据集合并到newIDs数据集

结果

使用`PROC FREQ`

将`Tasks`数据集合并到`newIDs`数据集