我有一个很大的表,需要把其中的稀有值替换掉(本例中指出现次数少于 10 的值;实际情况更复杂——某个变量可能有 1000 个水平,而我只想保留其中 15 个)。可能的水平列表会发生变化,因此我不想硬编码任何内容。
我的代码如下:
/* Name of the variable whose rare levels are to be collapsed. */
%let var = Make;

/* stage1_: one row per level of &var. that occurs at least 10 times in
   sashelp.cars, together with its row count, most frequent level first. */
proc sql;
    create table stage1_ as
        select   &var.,
                 count(*) as count
        from     sashelp.cars
        group by &var.
        having   count(*) >= 10
        order by 2 desc;
quit;
/* Left-join the source table to the list of frequent levels. A row whose
   level has no match in stage1_ receives the "Other_<variable>" label in
   the new column; every other row keeps its original level. */
proc sql;
    create table stage2_ as
        select all_rows.*,
               case
                   when frequent.&var. is missing then "Other_&var."
                   else all_rows.&var.
               end as &var._new
        from sashelp.cars as all_rows
             left join stage1_ as frequent
             on all_rows.&var. = frequent.&var.;
quit;
/* Swap the recoded column in for the original one: the old &var. is
   dropped as stage2_ is read and &var._new takes over its name. */
data result;
    set stage2_(drop=&var. rename=(&var._new=&var.));
run;
这段代码可以运行,但遗憾的是效率不高,因为每个变量都需要做一次连接(实际上我是在循环中对多个变量依次执行)。有没有更好的方法?比如某种智能的替换函数?
谢谢!
答案 0 :(得分:0)
您可能并不想更改实际的数据值。可以考虑改为给每个变量创建一个自定义格式,把稀有值映射到“其他”类别。
可以利用 FREQ 过程的 ODS 输出(`ods output OneWayFreqs=`),把所列每个变量的计数和百分比捕获到同一张表中。注意:FREQ 的 `table ... / out=` 选项只会捕获最后列出的那个变量。随后即可依据这些计数,按照您想要实现的“其他”归并规则来构造格式。
/* Simulated input: 1000 rows, each with ten variables x1-x10.
   Rows 1-599 use a multiplier of 100 and rows 600-1000 a multiplier of 150,
   so the later rows can take values the earlier rows never produce. */
data have;
    do row = 1 to 1000;
        array x{10} x1-x10;
        do _i = 1 to dim(x);
            x{_i} = ceil(ifn(row < 600, 100, 150) * ranuni(123));
        end;
        output;
    end;
    drop _i;   /* loop index only; not part of the data */
run;
/* Produce one-way frequency tables for x1-x10 and capture them, through
   the ODS OneWayFreqs output object, in a single data set named counts. */
proc freq data=have;
    ods output OneWayFreqs=counts;
    tables x1-x10;
run;
/* Stack the captured frequency rows into a long table with one row per
   (variable name, value) pair and its frequency.
   NOTE(review): this relies on each row of counts carrying a non-missing
   value only for the variable that row belongs to - confirm against the
   OneWayFreqs layout of your SAS release. */
data count_stack;
    length name $32;
    set counts;
    array x{*} x1-x10;
    do _i = 1 to dim(x);
        /* Test for missingness rather than truthiness: a plain
           "if value" would also discard a legitimate level of 0. */
        if not missing(x{_i}) then do;
            name  = vname(x{_i});
            value = x{_i};
            output;
        end;
    end;
    keep name value frequency;
run;

/* Within each variable, put the most frequent values first so the next
   step can pick the top levels by position. */
proc sort data=count_stack;
    by name descending frequency;
run;
/* Build a CNTLIN control data set defining one format per variable, named
   <variable>top. The 10 most frequent values of each variable map to
   themselves; a final HLO='O' row maps everything else to 'Other'.
   Expects count_stack sorted by name and descending frequency. */
data cntlin;
    set count_stack;
    by name;
    length fmtname $32;

    /* Position of this value within its variable (1 = most frequent). */
    if first.name then rank = 0;
    rank + 1;

    fmtname = trim(name)||'top';
    start = value;
    label = cats(value);
    if rank < 11 then output;

    /* After the last value of a variable, append its catch-all row. */
    if last.name then do;
        hlo = 'O';
        label = 'Other';
        output;
    end;

    drop rank;
run;

/* Create the formats described by the control data set. */
proc format cntlin=cntlin;
run;
ods html;

/* Generates the text "x1 x1top. x2 x2top. ... x10 x10top." for use inside
   a FORMAT statement, pairing each variable with its own format. */
%macro _top_format_pairs;
    %local i;
    %do i = 1 %to 10;
        x&i. x&i.top.
    %end;
%mend _top_format_pairs;

/* Re-run the frequency tables with the custom formats applied, so values
   outside each variable's top levels are reported together as 'Other'. */
proc freq data=have;
    tables x1-x10;
    format %_top_format_pairs;
run;