Question

总体目标是根据百分位数对变量进行分层。我想把每个变量分成10个等级（即第10、第20、……、第100百分位）：如果取值落在第10百分位以内，就重新编码为1；如果落在第10到第20百分位之间，就重新编码为2；依此类推。这个方法需要适用于我输入的任何数据集，并且我希望整个过程尽可能自动化。下面是我生成的一些测试数据：

/* Simulate 1000 observations of three uniform variables, each rounded to
   two decimals, all drawn from the same seeded stream (seed 1). */
data test;
    drop row;                              /* loop counter is not kept */
    do row = 1 to 1000;
        a = round(uniform(1) * 4,   .01);  /* scaled by 4   */
        b = round(uniform(1) * 10,  .01);  /* scaled by 10  */
        c = round(uniform(1) * 7.5, .01);  /* scaled by 7.5 */
        output;
    end;
    /* No input data set is read, so the step ends after this single pass. */
run;

以下宏用于创建一个值表，告诉您每个变量的10个百分位的截止值。我在代码下面添加了示例输出的图片。

/* Build WORK.PCTS: one row per decile (recode 1-10, percentile 10-100) with
   one column per analysis variable holding that variable's cutoff value. */
%macro percentiles(var, data=test);
    /* VAR  - comma-separated variable names; the commas must be masked,
              e.g. %str(a,b,c)
       DATA - data set to analyse; defaults to TEST, the value that was
              previously hard-coded, so existing calls behave as before.
       Side effects: creates WORK.PCTS, WORK.PERCENTILES and one WORK data
       set named after each variable in VAR. */
    %local count i variables;   /* keep loop state out of the caller's scope */

    /* Count the number of names in the list (same delimiter as %qscan below) */
    %let count=%sysfunc(countw(&var,%str(,)));

    /* Loop through the names, producing one 10-row data set per variable */
    %do i = 1 %to &count;
        %let variables=%qscan(&var,&i,%str(,));

        /* One row, ten columns: <name>10 <name>20 ... <name>100 */
        proc univariate data=&data noprint;
            var &variables;
            output out=pcts pctlpts  = 10 20 30 40 50 60 70 80 90 100
                            pctlpre  = &variables;
        run;

        /* Flip to ten rows, one column named after the variable */
        proc transpose data=pcts
                       out=&variables (rename=(col1=&variables) drop=_NAME_ _LABEL_);
        run;
    %end;

    /* Row labels: recode 1..10 and percentile 10..100 */
    data percentiles (drop=i);
        do i=1 to 10;
            recode=i;
            percentile=i*10;
            output;
        end;
        stop;
    run;

    /* One-to-one merge (no BY): row k of every per-variable data set holds
       the cutoff for the (10*k)th percentile. */
    data pcts;
        merge percentiles %sysfunc(tranwrd(&var.,%str(,),%str( )));
    run;
%mend percentiles;
%percentiles(%str(a,b,c));

output from above macro

以下代码是我目前重新编码变量的方式。我用上面的宏生成的表，手工填写每个变量在每个百分位上的截止值。正如您所看到的，如果需要重新编码的变量很多，这种做法会非常繁琐，工作量也会大到难以承受。有没有更好的流程，或者最好是一种能把这一步自动化的方法？

/* Hand-coded decile recode: each variable is compared against its ten
   cutoffs in ascending order and receives the number of the first cutoff it
   does not exceed. A value above the last cutoff leaves the recode missing. */
data test;
    set test;

    /* a */
    select;
        when (a <= .415)  recode_a = 1;
        when (a <= .785)  recode_a = 2;
        when (a <= 1.255) recode_a = 3;
        when (a <= 1.61)  recode_a = 4;
        when (a <= 2.03)  recode_a = 5;
        when (a <= 2.42)  recode_a = 6;
        when (a <= 2.76)  recode_a = 7;
        when (a <= 3.18)  recode_a = 8;
        when (a <= 3.64)  recode_a = 9;
        when (a <= 3.99)  recode_a = 10;
        otherwise;
    end;

    /* b */
    select;
        when (b <= .845)  recode_b = 1;
        when (b <= 1.88)  recode_b = 2;
        when (b <= 2.86)  recode_b = 3;
        when (b <= 4.005) recode_b = 4;
        when (b <= 5.03)  recode_b = 5;
        when (b <= 6.07)  recode_b = 6;
        when (b <= 6.995) recode_b = 7;
        when (b <= 8.035) recode_b = 8;
        when (b <= 9.16)  recode_b = 9;
        when (b <= 10)    recode_b = 10;
        otherwise;
    end;

    /* c */
    select;
        when (c <= .86)   recode_c = 1;
        when (c <= 1.58)  recode_c = 2;
        when (c <= 2.34)  recode_c = 3;
        when (c <= 3.15)  recode_c = 4;
        when (c <= 3.85)  recode_c = 5;
        when (c <= 4.615) recode_c = 6;
        when (c <= 5.315) recode_c = 7;
        when (c <= 5.96)  recode_c = 8;
        when (c <= 6.75)  recode_c = 9;
        when (c <= 7.5)   recode_c = 10;
        otherwise;
    end;
run;

/* Show the first five recoded rows */
proc print data=test (obs=5);
run;

sample of desired output

Answer 1

下面的代码应该可以动态地完成这项工作，无需任何硬编码——我已将其整理成一个宏。本质上，它先把需要处理的变量放入列表，调用上面的 %percentiles 宏生成截止值数据集，然后根据该数据集的内容把 if/else 语句拼成字符串，再把这些字符串存入一个宏变量，最后在数据步中引用该宏变量。同样，整个过程不涉及硬编码。

%MACRO stratify(library=,input=,output=);
    /* Recode every numeric column of &library..&input into deciles (1-10),
       writing the result, with added recode_<name> columns, to &output.

       library - libref that holds the input data set
       input   - input data set name (without libref)
       output  - name of the data set to create

       Relies on the %percentiles macro to build WORK.PCTS, then turns the
       cutoffs in that table into generated IF / ELSE IF statements.
       NOTE: %percentiles as originally written always analyses the data
       set TEST, so the cutoffs match the input only when input=test. */
    %local varlist varlist_space data_step_list;

    /* Get the numeric columns as a comma-separated and a space-separated
       list in a single query. Character columns are excluded because PROC
       UNIVARIATE (used by %percentiles) cannot analyse them. */
    proc sql noprint;
        select NAME, NAME
        into :varlist       separated by ",",
             :varlist_space separated by " "
        from dictionary.columns
        where libname=upcase("&library.") and memname=upcase("&input.")
              and type="num";
    quit;

    /* Nothing to recode: stop before generating invalid code */
    %if %length(&varlist_space.) = 0 %then %do;
        %put ERROR: stratify found no numeric columns in &library..&input.;
        %return;
    %end;

    %percentiles(%bquote(&varlist.));

    /* Put the cutoff table into long format: one row per variable x decile */
    proc transpose data = pcts out=pcts_long;
        by recode percentile;
        var &varlist_space.;
    run;

    /* Sort so each variable's cutoffs are in ascending IF / ELSE IF order */
    proc sort data = pcts_long;
        by _NAME_ percentile;
    run;

    /* Create the IF-THEN strings from the data itself.
       BEST32. keeps the full precision of each cutoff (the default BEST.
       is only 12 wide and may round it), and $200 leaves room for long
       variable names without truncating the statement. */
    data str;
        length STR $200 prefix $8;
        set pcts_long;
        by _NAME_;
        bin = percentile/10;
        prefix = ifc(first._NAME_, "if", "else if");
        STR = catx(" ", prefix, _NAME_, "<=", put(COL1,best32.), "then",
                   catx("_","recode",_NAME_), "=", cats(bin,";"));
        drop prefix;
    run;

    /* Put the statements into one macro variable */
    proc sql noprint;
        select STR
        into: data_step_list separated by " "
        from STR;
    quit;

    /* Run the generated statements against the input data */
    data &output.; set &library..&input.;
        &data_step_list.;
    run;

    proc print data = &output.(obs=5);
    run;

%MEND;

%stratify(library=work,input=test,output=final);

Answer 2

另一种选择是 PROC RANK。您也许可以把它做得更“自动化”，但它本身已经非常简单。使用 PROC RANK 还可以指定处理并列值（ties）的不同方式。请注意，它给出的分组编号是 0 到 9 而不是 1 到 10，但这很容易调整。

/* Same simulated data as in the question: 1000 rows, three uniform
   variables rounded to two decimals, seeded stream (seed 1). */
data test;
    do obs_no = 1 to 1000;
        a = round(uniform(1) * 4,   .01);
        b = round(uniform(1) * 10,  .01);
        c = round(uniform(1) * 7.5, .01);
        output;
    end;
    drop obs_no;
    /* No input is read, so the step finishes after one pass. */
run;

/* Assign each of a, b, c to one of 10 rank groups; the group numbers are
   written to rankA, rankB and rankC in WORK.WANT. */
proc rank groups=10 data=test out=want;
    var   a     b     c;
    ranks rankA rankB rankC;
run;

Answer 3

不需要所有代码生成。只需使用一个数组。基本上将PROC UNIVARIATE生成的数据集中的百分位数加载到二维数组中，然后找到实际值的十分位数。

%macro stratify(varlist,in=,out=,pcts=pcts);
/* Recode each variable in VARLIST into its decile (1-10) using an array
   lookup instead of generated IF/ELSE code.

   varlist - space-separated names of the numeric variables to recode
   in      - input data set
   out     - output data set: the input plus one recode_<name> per variable
   pcts    - name for the one-row data set of percentile cutoffs
             (default PCTS)

   The data step now honours IN= and OUT=; it previously always read TEST
   and wrote WANT regardless of the parameters. */
%local nvars pctls droplist recodes ;
%let varlist=%sysfunc(compbl(&varlist));
%let nvars=%sysfunc(countw(&varlist));
/* a b c -> pctl_a pctl_b pctl_c         (prefixes for PROC UNIVARIATE)  */
%let pctls=pctl_%sysfunc(tranwrd(&varlist,%str( ),%str( pctl_)));
/* a b c -> pctl_a: pctl_b: pctl_c:      (prefix lists for DROP)         */
%let droplist=pctl_%sysfunc(tranwrd(&varlist,%str( ),%str(: pctl_))):;
/* a b c -> recode_a recode_b recode_c   (output variable names)         */
%let recodes=recode_%sysfunc(tranwrd(&varlist,%str( ),%str( recode_)));

/* One row holding the 10 cutoffs of every variable */
proc univariate data=&in noprint ;
  var &varlist;
  output out=&pcts pctlpre=&pctls
         pctlpts = 10 20 30 40 50 60 70 80 90 100 
  ;
run;

data &out ;
  /* Read the cutoffs once; variables read by SET are retained, so they
     stay available for every row of the input. */
  if _n_=1 then set &pcts ;
  /* Declared before the second SET, so _numeric_ covers only the cutoff
     variables read from &pcts.
     NOTE(review): indexing as (percentile, variable) assumes a particular
     column order in the PROC UNIVARIATE output - confirm against the
     actual layout of &pcts when more than one variable is used. */
  array _pcts (10,&nvars) _numeric_;
  set &in ;
  array _in &varlist ;
  array out &recodes ;
  do i=1 to dim(_in);
    /* Empty loop body: advance j until the value no longer exceeds the
       j-th cutoff; j is then the decile. */
    do j=1 to 10 while(_in(i) > _pcts(j,i)); 
    end;
    out(i)=j;
  end;
  drop i j &droplist;
run;
%mend stratify;

下面是在启用 MPRINT 选项的情况下，对您生成的示例数据运行该宏时的日志。

1093  %stratify(a b c,in=test,out=want);
MPRINT(STRATIFY):   proc univariate data=test noprint ;
MPRINT(STRATIFY):   var a b c;
MPRINT(STRATIFY):   output out=pcts pctlpre=pctl_a pctl_b pctl_c pctlpts = 10 20 30 40 50 
60 70 80 90 100 ;
MPRINT(STRATIFY):   run;

NOTE: The data set WORK.PCTS has 1 observations and 30 variables.
NOTE: PROCEDURE UNIVARIATE used (Total process time):
      real time           0.01 seconds
      cpu time            0.01 seconds


MPRINT(STRATIFY):   data want ;
MPRINT(STRATIFY):   if _n_=1 then set pcts ;
MPRINT(STRATIFY):   array _pcts (10,3) _numeric_;
MPRINT(STRATIFY):   set test;
MPRINT(STRATIFY):   array _in a b c ;
MPRINT(STRATIFY):   array out recode_a recode_b recode_c ;
MPRINT(STRATIFY):   do i=1 to dim(_in);
MPRINT(STRATIFY):   do j=1 to 10 while(_in(i) > _pcts(j,i));
MPRINT(STRATIFY):   end;
MPRINT(STRATIFY):   out(i)=j;
MPRINT(STRATIFY):   end;
MPRINT(STRATIFY):   drop i j pctl_a: pctl_b: pctl_c:;
MPRINT(STRATIFY):   run;

NOTE: There were 1 observations read from the data set WORK.PCTS.
NOTE: There were 1000 observations read from the data set WORK.TEST.
NOTE: The data set WORK.WANT has 1000 observations and 6 variables

前五个观察结果是：

如何使用表2中的信息（在SAS中）重新编码表1中的变量

3 个答案: