SAS:将变量中的稀有级别替换为新级别“其他”

时间:2019-04-18 14:04:44

标签: sas

我有一个很大的表,需要把其中变量的稀有值替换掉(在本例中,稀有值指出现次数少于 10 的值;但实际情况更复杂——一个变量可能有 1000 个级别,而我只想保留 15 个)。可能出现的级别列表会发生变化,因此我不想对任何内容进行硬编码。

我的代码如下:

%let var = Make;

    /* Count how often each level of &var. occurs in SASHELP.CARS and keep
       only the levels that occur at least 10 times, ordered from the most
       to the least frequent. */
    proc sql;
        create table stage1_ as
            select &var.,
                   count(*) as count
            from sashelp.cars
            group by &var.
            having count(*) >= 10
            order by count desc;
    quit;

    /* Recode rare levels: every row of SASHELP.CARS whose &var. value has
       no match in stage1_ receives the catch-all level "Other_&var." in
       the new column &var._new; matched rows keep their original value. */
    proc sql;
        create table stage2_ as
            select src.*,
                   case
                       when freq.&var. is null then "Other_&var."
                       else src.&var.
                   end as &var._new
            from sashelp.cars as src
                 left join stage1_ as freq
                     on src.&var. = freq.&var.;
    quit;

    /* Swap the recoded column in for the original one: the old &var. is
       dropped while stage2_ is read, and &var._new takes over its name. */
    data result;
        set stage2_(drop=&var. rename=(&var._new=&var.));
    run;

这段代码可以工作,但遗憾的是效率不高,因为它需要为每个变量各做一次连接(实际上我是在循环中对多个变量执行它)。有没有更好的方法?比如某种智能的替换函数?

谢谢!

1 个答案:

答案 0 :(得分:0)

您可能并不想更改实际的数据值。可以改为给每个变量创建一个自定义格式,由格式把稀有值映射到“其他”类别。

可以利用 PROC FREQ 的 ODS 输出,把 TABLES 语句中列出的每个变量的计数和百分比捕获到同一张表中。注意:PROC FREQ 的 TABLES 语句的 OUT= 选项只会捕获最后列出的那个变量。得到的这些计数可用来按照您想要的“其他”归并规则构造格式。

/* Build a 1000-row demo table with a row counter and ten numeric columns
   x1-x10.  Rows below 600 are filled with ceil(100*ranuni(123)), the
   remaining rows with ceil(150*ranuni(123)). */
data have;
  do row = 1 to 1000;
    array x{10} x1-x10;
    do _col = 1 to dim(x);
      if row >= 600
        then x{_col} = ceil(150*ranuni(123));
        else x{_col} = ceil(100*ranuni(123));
    end;
    output;
  end;
  drop _col;  /* loop index only; not part of the demo data */
run;

/* Tabulate x1-x10 and route the OneWayFreqs ODS output object for all of
   the requested tables into a single data set named COUNTS. */
proc freq data=have;
  ods output onewayfreqs=counts;
  tables x1-x10;
run;

/* Reshape COUNTS into one row per (variable name, value) pair together
   with its frequency.

   For every input row, each of x1-x10 is inspected and a row is written
   for every column that holds a value.
   NOTE(review): presumably each COUNTS row carries a value only in the
   column of the table it belongs to - confirm against the ODS output.

   Output columns: name (variable name, $32), value, frequency. */
data count_stack;
  length name $32;
  set counts;
  array x{*} x1-x10;
  do _i = 1 to dim(x);
    /* Test for missingness rather than truthiness: a plain
       "if value" would also discard a legitimate data value of 0. */
    if not missing(x{_i}) then do;
      name = vname(x{_i});
      value = x{_i};
      output;
    end;
  end;
  keep name value frequency;
run;

/* Order the stacked counts so that, within each variable name, the most
   frequent values come first. */
proc sort data=count_stack;
  by name descending frequency;
run;

/* Build a PROC FORMAT control data set.  For each variable name the
   first 10 values in sort order are labelled with themselves, followed
   by one HLO='O' row labelled 'Other'.  The format for variable <name>
   is called <name>top. */
data cntlin;
  set count_stack;
  by name;
  length fmtname $32;

  /* Position of the current value within its BY group. */
  if first.name then _rank = 0;
  _rank + 1;

  fmtname = trim(name)||'top';
  start = value;
  label = cats(value);
  if _rank <= 10 then output;

  /* Close every group with the catch-all row. */
  if last.name then do;
    hlo = 'O';
    label = 'Other';
    output;
  end;

  drop _rank;
run;

/* Create the formats described by the control data set. */
proc format cntlin=cntlin;
run;

/* Open the ODS HTML destination, then tabulate x1-x10 again with each
   variable displayed through its own <name>top. format. */
ods html;

proc freq data=have;
  tables x1-x10;
  format x1 x1top. x2 x2top. x3 x3top. x4 x4top. x5  x5top.
         x6 x6top. x7 x7top. x8 x8top. x9 x9top. x10 x10top.;
run;