Question

我有一个带有时间间隔数据的数据集。它看起来像这样：

date  | person | shift_start | shift_end | activity_start | activity_end | activity
10JAN | Joe    | 8:00        | 16:00     | 10:00          | 11:00        | training
10JAN | Joe    | 8:00        | 16:00     | 13:00          | 14:00        | meeting
11JAN | Joe    | 8:00        | 16:00     | 8:00           | 11:00        | dragoning
11JAN | Joe    | 8:00        | 16:00     | 13:00          | 14:00        | wizardry

我想做的是遍历这张表并“填补空白”。对于上面的数据，我希望添加以下这些行：

date  | person | shift_start | shift_end | activity_start | activity_end | activity
10JAN | Joe    | 8:00        | 16:00     | 8:00           | 10:00        | default
10JAN | Joe    | 8:00        | 16:00     | 11:00          | 13:00        | default
10JAN | Joe    | 8:00        | 16:00     | 14:00          | 16:00        | default
11JAN | Joe    | 8:00        | 16:00     | 11:00          | 13:00        | default
11JAN | Joe    | 8:00        | 16:00     | 14:00          | 16:00        | default

如您所见，对于每个日期和人员，需要添加的行数是不固定的。我不确定能否在 data step 中一边继续处理传入的数据、一边以这种方式添加行。而且，即使支持这样做，我也不确定该如何实现我想要的结果。下面是我的思路：

/* Asker's untested sketch: for each row, try to derive the start/end of the
   'default' gap that precedes the row's activity, carrying state from the
   previous row through RETAINed variables. */
/* NOTE(review): as written this step does not compile -- see the notes on the
   unclosed DO blocks below -- and it has no SET statement, so it reads no
   input rows. */
data fill_gaps;
retain prev_date 
       prev_default_activity_end
       prev_default_activity_start;
    /* NOTE(review): in DATA step expressions <> is the MAX operator, not
       "not equal"; NE or ^= is presumably what was intended -- confirm. */
    if(date <> prev_date) then do; 
    /*different shift than previous row */
        if(shift_start = activity_start) then do; 
        /* the newly created activity start time should be the 
           activity end time of this row, but the new activity end 
           time cannot be determined without looking at the next row */
            default_activity_start = activity_end;
        /* NOTE(review): the DO opened by the preceding IF has no END before
           this ELSE. */
        else do;
        /* the new activity start and end time can be determined */
            default_activity_start = shift_start;
            default_activity_end = activity_start;
        end;
    /* NOTE(review): the outer DO (different-shift branch) is likewise not
       closed with END before this ELSE. */
    else do;
    /* same shift as previous row */
        default_activity_start = prev_default_activity_start;
        default_activity_end = activity_start;
    end;
    /* Remember this row's values for the next iteration. */
    prev_date = date;
    prev_default_activity_end = default_activity_end;
    prev_default_activity_start = default_activity_start;
run;

然后可能需要更多数据步骤来提取填充default_activity_start和default_activity_end的行，并将这些行（使用新列）附加到原始表中。

这种做法在我看来相当笨拙（hacky），而且我实际上还没有机会测试它（对不起，我知道这听起来很懒！）。有没有更优雅的方式来做到这一点？

感恩！

Answer 1

这是使用lag的解决方案，假设您的输入数据集名为test。此解决方案填补了空白并输出了原始行，有关详细信息，请参阅注释：

/* Sort by person and date so that BY-group processing (FIRST./LAST.) works. */
proc sort data=test;
  by person date;
run;

/*
 * Fill every uncovered stretch of a shift with an activity = 'default' row.
 *
 * Input : TEST, sorted by person, date. Rows within one person/date are
 *         assumed to arrive in activity_start order -- TODO confirm, the
 *         PROC SORT above does not enforce it.
 * Output: FILL_GAPS, containing every original row plus the gap rows.
 *         Each original row is written first, followed by the gap that
 *         precedes it and (on the last row of the day) the trailing gap,
 *         so the result still has to be sorted by
 *         person, date, activity_start.
 *
 * All helper variables start with an underscore and are removed by drop=_:.
 */
data fill_gaps (drop=_:);
  set test;
  by person date;

  /* LAG must be executed on every iteration; calling it conditionally
     would make it return stale values. */
  _prev_end = lag(activity_end);

  /* Keep the row's own times: the gap rows below overwrite
     activity_start / activity_end, and the trailing-gap logic must use
     the ORIGINAL end time, not the overwritten one. */
  _orig_start = activity_start;
  _orig_end   = activity_end;

  /* The original row is always kept. */
  output;

  /* Where does the potential gap in front of this activity begin?
     On the first row of a person/date it is the shift start; the lagged
     value belongs to a different BY group (or is missing) there and must
     not be used. Otherwise it is the previous activity's end. */
  if first.date then _gap_start = shift_start;
  else _gap_start = _prev_end;

  /* Emit the preceding gap only when it has a positive length. */
  if not missing(_gap_start) and _gap_start < _orig_start then do;
    activity_start = _gap_start;
    activity_end   = _orig_start;
    activity       = 'default';
    output;
  end;

  /* After the last activity of the day, fill up to the end of the shift. */
  if last.date and _orig_end < shift_end then do;
    activity_start = _orig_end;
    activity_end   = shift_end;
    activity       = 'default';
    output;
  end;
run;

输出的行并不是按正确顺序排列的，您还需要再按 person, date, activity_start 对结果排序。

代码中使用了临时变量，它们都以下划线为前缀；数据集选项 drop=_: 会把这些变量从输出数据集中删除。如果想保留它们以便调试，去掉该选项即可。

Answer 2

哈希对象是进行基于键的查找、或者在不前后移动读取位置的情况下操作数据时最方便的工具。下面是一个使用哈希的方案。在每次数据步骤迭代中，它先把当前行载入哈希对象，在分析并填补空白之后，再把该行取回并输出。如果数据本身就是按上面所示的顺序进入的，则无需排序。

/* Build the sample input table HAVE from the pipe-delimited in-stream
   records below. */
data have;
    infile cards dlm='|';
    /* date and person are read as character; the four time columns are read
       with the TIME8. informat; activity is character up to 20 bytes. */
    input  (date   person) (:$8.)  (shift_start  shift_end  activity_start  activity_end ) (:time8.) activity :$20.;
    /* The colon modifier belongs to INPUT only; FORMAT takes the bare
       format name. */
    format shift_start  shift_end  activity_start  activity_end time8.;
    cards;
10JAN | Joe    | 8:00        | 16:00     | 10:00          | 11:00        | training
10JAN | Joe    | 8:00        | 16:00     | 13:00          | 14:00        | meeting
11JAN | Joe    | 8:00        | 16:00     | 8:00           | 11:00        | dragoning
11JAN | Joe    | 8:00        | 16:00     | 13:00          | 14:00        | wizardry
;

/*
 * Build WANT: every row of HAVE plus activity = 'default' rows covering the
 * parts of each shift that no activity occupies. Rows are written in
 * chronological order within each person/date, so no re-sort is needed as
 * long as HAVE arrives grouped by person and date.
 *
 * A one-entry hash object is used as a scratch copy of the current row:
 * the row is stored before its variables are overwritten to build a gap
 * row, then restored with FIND so the original can be output unchanged.
 */
data want;
    /* Create the hash once; it is keyed by person and carries every
       column of the row. */
    if _n_=1 then
        do;
            dcl hash h();
            h.definekey('person');
            h.definedata('date','person', 'shift_start', 'shift_end',  'activity_start',  'activity_end', 'activity');
            h.definedone();
        end;

    set have;
    by person date notsorted;

    /* Save the untouched current row. */
    rc=h.add();

    /* LAG is called unconditionally on every iteration so that it always
       holds the previous row's activity_end. */
    lag_end=lag(activity_end);

    if first.date then
        do;
            /* First activity of the day: the only possible leading gap
               runs from the shift start. lag_end is deliberately ignored
               here because it comes from the previous person/date (or is
               missing on the very first row). */
            if shift_start < activity_start then
                do;
                    activity_end=activity_start;
                    activity_start=shift_start;
                    activity='default';
                    output;
                end;
        end;
    else if lag_end < activity_start then
        do;
            /* Gap between the previous activity and this one. */
            activity_end=activity_start;
            activity_start=lag_end;
            activity='default';
            output;
        end;

    /* Restore the original row, output it, and empty the hash so the next
       iteration can ADD again under the same key. */
    rc=h.find();
    output;
    rc=h.clear();

    /* Last activity of the day: fill up to the end of the shift. The
       restored activity_end is the original one at this point. */
    if last.date and activity_end < shift_end then
        do;
            activity_start=activity_end;
            activity_end=shift_end;
            activity='default';
            output;
        end;

    drop rc lag_end;
run;

每次迭代SAS数据步骤

2 个答案: