总体目标是根据百分位数对变量进行分层。我想将它分为10个等级(例如第10、第20、……、第100百分位数):如果值落入第10百分位数则重新编码为1,落入第20百分位数则重新编码为2,依此类推。此方法需要适用于我输入的任何数据集,并且我希望这个过程尽可能自动化。下面我生成了一些测试数据:
/* Simulated test data: 1000 rows of three uniform random variables,     */
/* scaled to the ranges 0-4, 0-10 and 0-7.5 and rounded to two decimals. */
/* All three draws use the seeded UNIFORM(1) stream, called in the order */
/* a, b, c on every row.                                                 */
data test;
  drop i;   /* loop counter is not kept in the output */
  do i = 1 to 1000;
    a = round(4   * uniform(1), 0.01);
    b = round(10  * uniform(1), 0.01);
    c = round(7.5 * uniform(1), 0.01);
    output;
  end;
  /* no explicit STOP needed: a step with no input executes only once */
run;
以下宏用于创建一个值表,告诉您每个变量的10个百分位的截止值。我在代码下面添加了示例输出的图片。
/*--------------------------------------------------------------------------
  %percentiles - build WORK.PCTS, a table of decile cutoffs per variable.

  Parameters:
    var   comma-separated list of numeric variable names; the caller must
          macro-quote it, e.g. %str(a,b,c) or %bquote(&list)
    data  input data set to analyse (default: test)

  Output:
    WORK.PCTS with 10 rows: RECODE (1-10), PERCENTILE (10-100) and one
    column per variable holding that variable's 10th ... 100th percentile.

  Side effects:
    Creates WORK.PERCENTILES and one WORK data set named after each
    variable in the list.
--------------------------------------------------------------------------*/
%macro percentiles(var, data=test);
/* Keep loop state local so a calling macro's &i / &count are not clobbered */
%local count i variables;
/* Count the names in the list; use the same comma delimiter as %scan below */
%let count=%sysfunc(countw(&var,%str(,)));
/* Loop through the variables one at a time */
%do i = 1 %to &count;
/* %let strips stray blanks, so a list written as "a, b, c" also works */
%let variables=%scan(&var,&i,%str(,));
/* One row holding the 10th ... 100th percentiles of the current variable */
proc univariate data=&data noprint;
var &variables;
output out=pcts pctlpts = 10 20 30 40 50 60 70 80 90 100
pctlpre = &variables;
run;
/* Turn that row into a 10-row column stored in a data set named after the variable */
proc transpose data=pcts out=&variables (rename=(col1=&variables) drop=_NAME_ _LABEL_);
run;
%end;
/* Skeleton table: recode value 1-10 and the percentile it stands for */
data percentiles (drop=i);
do i=1 to 10;
recode=i;
percentile=i*10;
output;
end;
stop;
run;
/* Side-by-side (one-to-one, no BY) merge of the skeleton with every per-variable column */
data pcts;
merge percentiles %sysfunc(tranwrd(&var.,%str(,),%str( )));
run;
%mend;
%percentiles(%str(a,b,c));
以下代码是我目前重新编码变量的方式。我使用上面宏生成的表,手工填入每个变量在每个百分位数上的截止点。正如您所看到的,如果需要重新编码的变量很多,这将非常繁琐,工作量也会变得过大。是否有更好的做法,或者最好是一种能够自动完成这一部分的方法?
/* Recode a, b and c into decile bins 1-10 using hard-coded cutoffs.      */
/* Each row of _cut holds the ten upper bounds for one variable, in the   */
/* same order as _src/_dst. A value is assigned the first bin whose upper */
/* bound it does not exceed; a value above the last bound is left         */
/* unassigned.                                                            */
data test;
  set test;
  array _src {3} a b c;
  array _dst {3} recode_a recode_b recode_c;
  array _cut {3,10} _temporary_ (
    .415  .785 1.255 1.61  2.03 2.42  2.76  3.18  3.64 3.99
    .845 1.88  2.86  4.005 5.03 6.07  6.995 8.035 9.16 10
    .86  1.58  2.34  3.15  3.85 4.615 5.315 5.96  6.75 7.5
  );
  do _v = 1 to 3;
    do _k = 1 to 10;
      if _src{_v} <= _cut{_v, _k} then do;
        _dst{_v} = _k;
        leave;   /* first matching bound wins, like the if / else-if chain */
      end;
    end;
  end;
  drop _v _k;
run;

/* Show the first five recoded rows */
proc print data=test (obs=5);
run;
答案 0 :(得分:1)
以下方法应该可以动态地完成任务,无需硬编码——我已将其整理成一个宏。本质上,它先把需要的变量放入列表,调用上面的宏生成百分位数数据集,然后根据该数据集的内容把 if-then 语句拼接成长字符串。接着把这些字符串放入宏变量,在最终的数据步中调用即可。同样,不涉及任何硬编码。
/*--------------------------------------------------------------------------
  %stratify - recode every numeric variable of a data set into decile bins.

  Parameters:
    library  libref of the input data set (e.g. work)
    input    member name of the input data set
    output   name of the data set to create

  For each numeric variable X the output gains RECODE_X (1-10), assigned by
  an if / else-if chain generated from the cutoff table WORK.PCTS.

  NOTE(review): %percentiles is invoked with the variable list only, so the
  cutoffs come from whatever data set that macro reads - confirm it is the
  same data as &library..&input.
  NOTE: the generated statements are held in one macro variable, so a very
  large number of variables can exceed the macro variable length limit.
--------------------------------------------------------------------------*/
%MACRO stratify(library=,input=,output=);
%local varlist varlist_space data_step_list;
/* Collect the numeric columns once, as a comma list and as a blank list. */
/* Character columns are skipped: PROC UNIVARIATE cannot analyse them.    */
proc sql noprint;
select NAME, NAME
into :varlist separated by ",",
     :varlist_space separated by " "
from dictionary.columns
where libname=upcase("&library.") and memname=upcase("&input.")
  and type="num"
order by varnum;
quit;
/* Nothing to do if the data set was not found or has no numeric columns */
%if %length(&varlist_space.)=0 %then %do;
%put ERROR: stratify found no numeric variables in &library..&input..;
%return;
%end;
%percentiles(%bquote(&varlist.));
/* Put the cutoff table into long format: one row per variable and percentile */
proc transpose data = pcts out=pcts_long;
by recode percentile;
var &varlist_space.;
run;
/* Sort so each variable's cutoffs are ascending, i.e. in if / else-if order */
proc sort data = pcts_long;
by _NAME_ percentile;
run;
/* Build one if-then statement per cutoff from the data itself.            */
/* $200 leaves room for 32-character names; BEST32. keeps the full         */
/* precision of the cutoff instead of rounding it to 12 characters.        */
data str;
length STR $200;
set pcts_long;
bin = percentile/10;
by _NAME_;
if first._NAME_ then do;
STR = "if "||strip(_NAME_)||" <= "||strip(put(COL1,best32.))||" then "||catx("_","recode",_NAME_)||" = "||strip(put(bin,best.))||";";
end;
else do;
STR = "else if "||strip(_NAME_)||" <= "||strip(put(COL1,best32.))||" then "||catx("_","recode",_NAME_)||" = "||strip(put(bin,best.))||";";
end;
run;
/* Concatenate the statements; ORDER BY pins the if / else-if sequence, */
/* which SQL does not otherwise guarantee.                              */
proc sql noprint;
select STR
into: data_step_list separated by " "
from STR
order by _NAME_, percentile;
quit;
/* Apply the generated statements to the input data */
data &output.; set &library..&input.;
&data_step_list.;
run;
proc print data = &output.(obs=5);
run;
%MEND;
%stratify(library=work,input=test,output=final);
答案 1 :(得分:1)
另一种选择是 PROC RANK。你也许还可以让它更“自动化”,但它本身已经非常简单。使用 PROC RANK,您还可以指定处理并列值(ties)的不同方式。请注意,它生成的分组编号是0到9而不是1到10,但这很容易调整。
/* Rebuild the 1000-row sample: three uniform variables rounded to 2 decimals. */
/* The seeded UNIFORM(1) stream is consumed in the order a, b, c on each row.  */
data test;
  do _row = 1 to 1000;
    a = round(uniform(1) * 4,   0.01);
    b = round(uniform(1) * 10,  0.01);
    c = round(uniform(1) * 7.5, 0.01);
    output;
  end;
  drop _row;
run;

/* Split each variable into 10 rank groups, numbered 0-9, stored in rankA-rankC */
proc rank data=test groups=10 out=want;
  var a b c;
  ranks rankA rankB rankC;
run;
答案 2 :(得分:0)
完全不需要生成代码,只需使用数组即可。基本思路是把PROC UNIVARIATE生成的数据集中的百分位数加载到二维数组中,然后查找每个实际值所属的十分位。
/*--------------------------------------------------------------------------
  %stratify - recode variables into decile bins with an array lookup.

  Parameters:
    varlist  blank-separated list of numeric variables to recode
    in       input data set
    out      output data set
    pcts     name for the intermediate percentile data set (default: pcts)

  For each variable X the output gains RECODE_X = 1-10, the first decile
  whose upper cutoff (10th ... 100th percentile) is not exceeded by X.
  A missing X gives a missing RECODE_X.
--------------------------------------------------------------------------*/
%macro stratify(varlist,in=,out=,pcts=pcts);
%local nvars pctls droplist recodes ;
%let varlist=%sysfunc(compbl(&varlist));
%let nvars=%sysfunc(countw(&varlist,%str( )));
/* pctl_<var> prefixes for PROC UNIVARIATE, matching drop patterns, recode_<var> names */
%let pctls=pctl_%sysfunc(tranwrd(&varlist,%str( ),%str( pctl_)));
%let droplist=pctl_%sysfunc(tranwrd(&varlist,%str( ),%str(: pctl_))):;
%let recodes=recode_%sysfunc(tranwrd(&varlist,%str( ),%str( recode_)));
/* One row: for each variable in turn, its 10th ... 100th percentiles */
proc univariate data=&in noprint ;
var &varlist;
output out=&pcts pctlpre=&pctls
pctlpts = 10 20 30 40 50 60 70 80 90 100
;
run;
data &out ;
/* Load the single cutoff row once; SET variables are retained for later rows */
if _n_=1 then set &pcts ;
/* At this point _numeric_ covers only the cutoff variables. They are stored */
/* variable by variable and arrays fill row-major, so a row is one variable. */
array _pcts (&nvars,10) _numeric_;
set &in;
array _in &varlist ;
array out &recodes ;
do i=1 to dim(_in);
if missing(_in(i)) then out(i)=.;
else do;
/* Advance j until the value no longer exceeds the j-th cutoff */
do j=1 to 10 while(_in(i) > _pcts(i,j));
end;
out(i)=j;
end;
end;
drop i j &droplist;
run;
%mend stratify;
因此,如果对您生成的样本数据使用该宏,在启用MPRINT选项的情况下,日志如下所示。
1093 %stratify(a b c,in=test,out=want);
MPRINT(STRATIFY): proc univariate data=test noprint ;
MPRINT(STRATIFY): var a b c;
MPRINT(STRATIFY): output out=pcts pctlpre=pctl_a pctl_b pctl_c pctlpts = 10 20 30 40 50
60 70 80 90 100 ;
MPRINT(STRATIFY): run;
NOTE: The data set WORK.PCTS has 1 observations and 30 variables.
NOTE: PROCEDURE UNIVARIATE used (Total process time):
real time 0.01 seconds
cpu time 0.01 seconds
MPRINT(STRATIFY): data want ;
MPRINT(STRATIFY): if _n_=1 then set pcts ;
MPRINT(STRATIFY): array _pcts (10,3) _numeric_;
MPRINT(STRATIFY): set test;
MPRINT(STRATIFY): array _in a b c ;
MPRINT(STRATIFY): array out recode_a recode_b recode_c ;
MPRINT(STRATIFY): do i=1 to dim(_in);
MPRINT(STRATIFY): do j=1 to 10 while(_in(i) > _pcts(j,i));
MPRINT(STRATIFY): end;
MPRINT(STRATIFY): out(i)=j;
MPRINT(STRATIFY): end;
MPRINT(STRATIFY): drop i j pctl_a: pctl_b: pctl_c:;
MPRINT(STRATIFY): run;
NOTE: There were 1 observations read from the data set WORK.PCTS.
NOTE: There were 1000 observations read from the data set WORK.TEST.
NOTE: The data set WORK.WANT has 1000 observations and 6 variables
前五个观察结果是: