变量检查和摘要

时间:2015-04-01 00:24:15

标签: sas

问题/疑问

我试图对数据集中的一组变量(收入、成本、利润和vcosts)进行简单检查:取出每个变量的最大值和第二大值,检查这两个值之和是否大于该变量总和的90%,如果是,则标记该变量。我还想检查最大值是否不大于总和的60%。

我从“Macro that outputs table with testing results of SAS table”这个问题中获得了一些帮助,但现在我想解决一个更基本的问题。这看起来并不难,但我不知道最后该如何生成这张基本表。

我知道所有变量名称。

以下是我创建的示例数据集:https://www.dropbox.com/s/x575w5d551uu47p/dataset%20%281%29.csv?dl=0

期望输出

我想把这个基本表:

enter image description here

转换成像这样的另一个表:

enter image description here

可重复的例子

/* Create some dummy data: four measures (revenue, costs, profits, vcost)
   for each of three firms. */
data have;
    /* Fix the random-number seed so every run generates the same values,
       which keeps the example reproducible. */
    call streaminit(20150401);
    do firm = 1 to 3;
        revenue = rand("uniform");
        costs = rand("uniform");
        profits = rand("uniform");
        vcost = rand("uniform");
        output;
    end;
run;

3 个答案:

答案 0 :(得分:1)

这里的难点是为每个变量提取前2个值。这在大多数SQL实现中都很简单,但据我所知,SAS的proc sql并不支持select top n语法。

我可以想到几种可行的方法:

  1. 按感兴趣的每个变量分别对数据集进行降序排序,从前2个观测中取值,转置后再把结果全部拼接在一起——由于要多次排序,效率非常低,而且并不比其他方法简单多少。

  2. 写一个(相当复杂的)数据步骤来提取每个变量的前2个值。

  3. 让proc univariate为您提取最高值,然后将输出数据集转换为正确的格式。

    数据步骤方法

    /* One pass over HAVE that tracks the largest and second-largest value of
       each analysis variable and writes one summary row per variable
       (varname, top_1, top_2, top_2_total) after the last observation. */
    data top2;
      array v{*} revenue costs profits vcost;
      /* Temporary arrays keep their values across observations and start out
         missing, so negative data are handled as well as positive data. */
      array best{4} _temporary_;
      array second{4} _temporary_;
      set have end = eof;
      do i = 1 to dim(v);
        /* Skip missing inputs so they can never displace a real value. */
        if not missing(v[i]) then do;
          if v[i] > best[i] then do;
            /* New maximum: the previous maximum becomes the runner-up. */
            second[i] = best[i];
            best[i] = v[i];
          end;
          /* Not a new maximum: a value tied with the maximum still counts
             as the second-largest value. */
          else if v[i] > second[i] then second[i] = v[i];
        end;
      end;
      length varname $32;
      if eof then do i = 1 to dim(v);
        varname = vname(v[i]);
        top_1    = best[i];
        top_2    = second[i];
        /* SUM() ignores a missing top_2 when fewer than two values exist. */
        top_2_total = sum(top_1, top_2);
        output;
      end;
      keep varname top_:;
    run;
    

    proc univariate方法

    /* Capture the ExtremeObs ODS table that PROC UNIVARIATE produces for every
       numeric variable of HAVE except firm, keeping only the variable name
       and the "high" column. */
    ods _all_ close;
    ods output extremeobs = extremeobs(keep = varname high);
    proc univariate data = have(drop = firm);
    run;
    ods listing;

    /* Reduce the captured table to one row per variable.
       The rows of each variable are read in the order the table lists them,
       with the largest value last, so the last row supplies top_1 and the row
       before it supplies top_2. Using LAST.varname instead of fixed row
       numbers 4 and 5 also works when a variable contributes fewer than five
       rows.
       NOTE(review): assumes the table is shorter than five rows for inputs
       with fewer than five observations - confirm against PROC UNIVARIATE. */
    data top2_b;
        set extremeobs;
        by varname notsorted;
        retain prev_high;
        if first.varname then call missing(prev_high);
        if last.varname then do;
            top_1 = high;
            top_2 = prev_high;
            /* SUM() ignores a missing top_2 when only one row exists. */
            top_2_total = sum(top_1, top_2);
            output;
        end;
        /* Remember this row's value for the next iteration. */
        prev_high = high;
        drop prev_high high;
    run;
    

    一旦你得到了这个,你就可以将它与proc means / proc summary中现有的简单表合并,并计算任何其他感兴趣的测量值。

答案 1 :(得分:1)

根据您对上一个答案的评论,看起来top_2_total是2个最大总值的总和。为此,您需要额外编写几个步骤。我使用proc transpose和数据步骤来得到上一个答案中已经实现的结果,再用PROC SUMMARY取出前2个最大的总值,并重新使用该数据集生成最终结果。如果有帮助,请告诉我。

/* Sample data: four uniform random measures for each of three firms. */
data have;
    /* Fix the random-number seed so every run generates the same values. */
    call streaminit(20150401);
    do firm = 1 to 3;
        revenue = rand("uniform");
        costs = rand("uniform");
        profits = rand("uniform");
        vcost = rand("uniform");
        output;
    end;
run;

/* Turn each analysis variable into one row; the three firms become the
   columns top_1, top_2 and top_3. */
proc transpose data=have
               out=want
               prefix=top_;
    var revenue--vcost;
run;

/* Rank the values within each row and add the row total. */
data want;
    set want;
    /* The array lists the columns in reverse order, so the ascending sort
       done by CALL SORTN leaves the largest value in top_1. */
    array ranked(*) top_3-top_1;
    call sortn(of ranked[*]);
    total = sum(of ranked[*]);
run;
/* Use PROC SUMMARY to pick the two largest row totals (total_1, total_2). */
proc summary data=want nway;
    output out=total_top_2_rec(drop=_:)
           idgroup(max(total) out[2](total)=);
run;

data want;
    /* Read the single summary row once; its values are retained for every
       row of WANT, and top_2_total is the sum of the two largest totals. */
    if _n_=1 then set total_top_2_rec;
    top_2_total = sum(total_1, total_2);

    set want;
    /* A comparison evaluates to 1 when true and 0 when false. */
    Flag90 = (sum(top_1, top_2) > 0.9 * top_2_total);
    Flag60 = (top_1 > top_2_total * 0.6);

    drop total_1 total_2;
run;

proc print data=want;
run;

编辑:我在PROC TRANSPOSE之前添加了一个逻辑,您可以在其中添加变量以供计算考虑,其余部分由代码完成。在此之后,代码执行者不需要手动更改。变量应作为空格分隔列表输入。

/* Read firm and three measures from the comma-separated file, starting at
   the second record (firstobs=2). */
data have;
    infile 'C:\dataset (1).csv'
           dlm=','
           dsd
           missover
           firstobs=2;
    input firm
          v1
          v2
          v3;
run;

/* Analysis variables: edit this blank-separated list to add or remove
   columns. */
%let variable_to_consider=
    v1
    v2
    v3
;

/* Collapse the runs of blanks left over from the multi-line layout. */
%let variable_to_consider=%cmpres(&variable_to_consider);

/* One row per analysis variable, one top_ column per input row. */
proc transpose data=have out=want prefix=top_;
    var &variable_to_consider;
run;

/* Row count of HAVE, used later to size the top_ array. */
proc sql noprint;
    select count(*)
      into :obs_count
      from have;
quit;
/* Re-assigning through %LET strips the leading blanks left by INTO. */
%let obs_count=&obs_count;

/* Rank the values within each row, add the row total, and keep only the
   two largest values, the total and the variable name. */
data want;
    set want;
    /* The array lists the columns in reverse order, so the ascending sort
       done by CALL SORTN leaves the largest value in top_1. */
    array ranked(*) top_&obs_count.-top_1;
    call sortn(of ranked[*]);
    total = sum(of ranked[*]);

    keep _name_ top_1 top_2 total;
run;

/* Use PROC SUMMARY to pick the two largest row totals (total_1, total_2). */
proc summary data=want nway;
    output out=total_top_2_rec(drop=_:)
           idgroup(max(total) out[2](total)=);
run;

data want;
    /* Read the single summary row once; its values are retained for every
       row of WANT, and top_2_total is the sum of the two largest totals. */
    if _n_=1 then set total_top_2_rec;
    top_2_total = sum(total_1, total_2);

    set want;
    /* A comparison evaluates to 1 when true and 0 when false. */
    Flag90 = (sum(top_1, top_2) > 0.9 * top_2_total);
    Flag60 = (top_1 > top_2_total * 0.6);

    drop total_1 total_2;
run;

proc print data=want;
run;

编辑2014-04-05:如上所述,我已更新逻辑并修复了问题。以下是更新后的代码。

/* Sample data: four uniform random measures for each of three firms. */
data have1;
    /* Fix the random-number seed so every run generates the same values. */
    call streaminit(20150401);
    do firm = 1 to 3;
        revenue = rand("uniform");
        costs = rand("uniform");
        profits = rand("uniform");
        vcost = rand("uniform");
        output;
    end;
run;

/* Read firm and three measures from the comma-separated file, starting at
   the second record (firstobs=2). */
data have2;
    infile 'dataset (1).csv'
           dlm=','
           dsd
           missover
           firstobs=2;
    input firm
          v1
          v2
          v3;
run;
/*
 * mymacro - per-variable concentration check.
 *
 * For every variable named in VARIABLE_TO_CONSIDER, writes one row to
 * OUTPUT_DATASET containing:
 *   _name_       the variable name
 *   top_1        its largest value
 *   top_2        its second-largest value
 *   top_2_total  top_1 + top_2
 *   total        sum of all its values
 *   Flag90       1 when top_2_total > 90% of total, else 0
 *   Flag60       1 when top_1 > 60% of total, else 0
 *
 * Parameters:
 *   input_dataset=         dataset to analyse (one row per firm)
 *   output_dataset=        dataset to create
 *   variable_to_consider=  blank-separated list of numeric variables
 */
%macro mymacro(input_dataset= ,output_dataset=, variable_to_consider=);

    /* Keep the row count local so a global macro variable with the same
       name is never overwritten. */
    %local obs_count;

    %let variable_to_consider=%cmpres(&variable_to_consider);

    proc sql noprint;
        select count(*) into : obs_count from &input_dataset;
    quit;
    /* Re-assigning through %LET strips the leading blanks left by INTO. */
    %let obs_count=&obs_count;

    /* Without rows there are no top_ columns to rank. */
    %if &obs_count < 1 %then %do;
        %put WARNING: mymacro - &input_dataset has no observations, &output_dataset not created.;
        %return;
    %end;

    /* One row per analysis variable, one top_ column per input row. */
    proc transpose data=&input_dataset out=&output_dataset prefix=top_;
        var &variable_to_consider;
    run;

    data &output_dataset;
        set &output_dataset;
        /* The array lists the columns in reverse order, so the ascending
           sort done by CALL SORTN leaves the largest value in top_1. */
        array top(*) top_&obs_count.-top_1;
        call sortn(of top[*]);
        total=sum(of top[*]);

        top_2_total=sum(top_1, top_2);
        if top_2_total > 0.9 * total then Flag90=1; else Flag90=0;
        if top_1 > total * 0.6 then Flag60=1; else Flag60=0;

        keep _name_ top_1 top_2 top_2_total total Flag60 Flag90;
    run;
%mend mymacro;

/* Run the check on both inputs, then print each result. */
%mymacro(
    input_dataset=have1,
    output_dataset=want1,
    variable_to_consider=revenue costs profits vcost
)
%mymacro(
    input_dataset=have2,
    output_dataset=want2,
    variable_to_consider=v1 v2 v3
)


proc print data=want1;
run;
proc print data=want2;
run;

答案 2 :(得分:1)

对于分子大于或等于分母的值,最后一步中的flag1和flag2将为正整数,如果分子小于分母,则为零。

/* Sample data: four rows, each holding the constant label VarName and four
   uniform random measures; the loop counter firm is not stored. */
data have(drop=firm);
    length VarName $8;
    array measure{4} revenue costs profits vcost;
    do firm = 1 to 4;
        VarName = 'Variable';
        /* Fill the measures in the order revenue, costs, profits, vcost. */
        do j = 1 to dim(measure);
            measure[j] = rand("uniform");
        end;
        output;
    end;
    drop j;
run;

/* Transpose HAVE into TRANSOUT: the former variable names go into the
   column Variable and the values into columns prefixed Var_. */
proc transpose data=have
               out=transout
               name=Variable
               prefix=Var_;
run;

/* Write the SAS code generated by macros to the log. */
options mprint;

/*
 * calcflag - row-wise top-2 summary and concentration flags.
 *
 * Parameters:
 *   Varlist  comma-separated list of columns to analyse; quote the commas
 *            with %str() when calling, e.g. %str(Var_1,Var_2)
 *   in=      input table (default transout); must contain the column
 *            Variable and the columns named in Varlist
 *   out=     table to create (default outtable)
 *
 * Output columns: Variable, Sum_var, Top_1, Top_2, Top_2_total,
 *   flag1 = 1 when Top_2_total >= 90% of Sum_var, else 0
 *   flag2 = 1 when Top_1 >= 60% of Sum_var, else 0
 * Both flags are missing when Sum_var is missing. Explicit comparisons
 * replace floor(ratio), so a zero sum no longer divides by zero and the
 * flags can only ever be 0 or 1.
 */
%macro calcflag(Varlist, in=transout, out=outtable);
    proc sql;
        create table &out as
        select Variable,
               sum(&Varlist) as Sum_var,
               largest(1,&Varlist) as Top_1,
               largest(2,&Varlist) as Top_2,
               sum(largest(1,&Varlist),largest(2,&Varlist)) as Top_2_total,
               case
                   when missing(sum(&Varlist)) then .
                   when sum(largest(1,&Varlist),largest(2,&Varlist))
                        >= 0.9 * sum(&Varlist) then 1
                   else 0
               end as flag1,
               case
                   when missing(sum(&Varlist)) then .
                   when largest(1,&Varlist) >= 0.6 * sum(&Varlist) then 1
                   else 0
               end as flag2
        from &in;
    quit;
%mend calcflag;

/* Analyse the four transposed columns; %str() keeps the commas inside a
   single macro argument. */
%calcflag(%str(Var_1,Var_2,Var_3,Var_4))