Question

问题/疑问

我想对数据集中的一组变量（收入、成本、利润和vcosts）做一个简单检查：取出每个变量的最大值和第二大值，检查二者之和是否大于该变量总和的90％，如果是，则标记该变量。我还要检查最大值是否不超过该变量总和的60％。

我从另一个问题“Macro that outputs table with testing results of SAS table”中获得了一些帮助，但现在我想回答一个更基本的问题。这看起来并不难，但我不知道最后该如何生成这张基本表。

我知道所有变量名称。

以下是我创建的示例数据集：https://www.dropbox.com/s/x575w5d551uu47p/dataset%20%281%29.csv?dl=0

期望输出

我想把下面这个基本表：

enter image description here

转换成下面这样的另一张表：

enter image description here

可重复的例子

/* Create reproducible dummy data: three firms, four variables to assess
   (revenue, costs, profits, vcost). */
data have;
    /* Fix the random-number seed so every run generates the same values. */
    call streaminit(12345);
    do firm = 1 to 3;
        revenue = rand("uniform");
        costs = rand("uniform");
        profits = rand("uniform");
        vcost = rand("uniform");
        output;
    end;
run;

Answer 1

这里的难点是为每个变量提取前2个值。这在SQL的大多数实现中都很简单，但在SAS中我并不认为proc sql支持select top n语法。

我可以想到几种可行的方法：

将数据集依次按每个感兴趣的变量降序排序，从前2个观测中取值，转置后再全部拼接在一起——由于需要多次排序，效率非常低，而且并不比其他方法简单多少。
写一个（相当复杂的）数据步骤来提取每个变量的前2个值。
让 proc univariate 为您提取最大的几个值，然后将输出数据集整理成所需的格式。

数据步骤方法

/* Find the largest and second-largest value of each analysis variable in a
   single pass over HAVE, then write one summary row per variable
   (varname, top_1, top_2, top_2_total). */
data top2;
  array v{4} revenue costs profits vcost;
  /* Running maxima. _TEMPORARY_ arrays are retained across observations and
     start out missing, so negative values are ranked correctly (a starting
     value of 0 would always beat them). */
  array best{4} _temporary_;
  array second{4} _temporary_;
  set have end = eof;
  do i = 1 to 4;
    /* Missing values never take part in the ranking. */
    if not missing(v[i]) then do;
      if missing(best[i]) or v[i] > best[i] then do;
        /* New maximum: the previous maximum becomes the runner-up. */
        second[i] = best[i];
        best[i] = v[i];
      end;
      /* A value tied with the maximum still counts as the second largest. */
      else if missing(second[i]) or v[i] > second[i] then second[i] = v[i];
    end;
  end;
  /* VNAME can return names of up to 32 characters. */
  length varname $32;
  if eof then do i = 1 to 4;
    varname = vname(v[i]);
    top_1    = best[i];
    top_2    = second[i];
    /* SUM ignores a missing runner-up (fewer than two non-missing values). */
    top_2_total = sum(top_1, top_2);
    output;
  end;
  keep varname top_:;
run;

proc univariate 方法

/* Capture PROC UNIVARIATE's extreme-observations table in a dataset instead
   of printing it. */
ods _all_ close;
ods output extremeobs = extremeobs(keep = varname high);
proc univariate data = have(drop = firm);
run;
ods listing;

/* Reduce the captured table to one row per variable.
   The HIGH column lists each variable's highest values in ascending order,
   so the last row of a variable is its maximum and the row before it is the
   runner-up. Using LAST. instead of fixed row numbers 4 and 5 keeps this
   working when the input has fewer than five observations. */
data top2_b;
    set extremeobs;
    by varname notsorted;
    retain prev_high;
    if first.varname then call missing(prev_high);
    if last.varname then do;
        top_2 = prev_high;
        top_1 = high;
        /* SUM ignores a missing runner-up (variable with a single row). */
        top_2_total = sum(top_1, top_2);
        output;
    end;
    /* Remember this row's value for the next row of the same variable. */
    prev_high = high;
    drop prev_high high;
run;

一旦你得到了这个，你就可以将它与proc means / proc summary中现有的简单表合并，并计算任何其他感兴趣的测量值。

Answer 2

根据您对上一个答案的评论，top_2_total 似乎应该是最大的2个合计值之和。为此，您需要再写几个步骤。我用 proc transpose 和一个 data 步得到上一个答案已经实现的结果，然后用 PROC SUMMARY 取出最大的2个合计值，并复用该数据集生成最终答案。如果有帮助，请告诉我。

/* Simulated input: one row per firm (1 to 3), four measures per row, each
   drawn with rand("uniform"). */
data have;
    do firm = 1 to 3;
        array measure{4} revenue costs profits vcost;
        do k = 1 to dim(measure);
            measure[k] = rand("uniform");
        end;
        output;
    end;
    drop k;
run;

/* One row per analysis variable; the three firm values become top_1-top_3. */
proc transpose data=have out=want prefix=top_;
    var revenue--vcost;
run;

/* Sort each row's values. The array lists the columns in descending name
   order and SORTN sorts ascending, so top_1 ends up holding the largest
   value. TOTAL is the row sum. */
data want;
    set want;
    array ranked(*) top_3-top_1;
    call sortn(of ranked[*]);
    total = sum(of ranked[*]);
run;

/* Keep the two largest row totals as total_1 and total_2 in a one-row dataset. */
proc summary data=want nway;
    output out=total_top_2_rec(drop=_:)
           idgroup(max(total) out[2](total)=);
run;

/* Read the one-row summary once (its values are retained for every row),
   then derive the 90% and 60% flags for each variable. */
data want;
    if _n_ = 1 then set total_top_2_rec;
    top_2_total = sum(total_1, total_2);

    set want;
    Flag90 = (sum(top_1, top_2) > 0.9 * top_2_total);
    Flag60 = (top_1 > top_2_total * 0.6);

    drop total_1 total_2;
run;

proc print data=want;
run;

编辑：我在PROC TRANSPOSE之前添加了一个逻辑，您可以在其中添加变量以供计算考虑，其余部分由代码完成。在此之后，代码执行者不需要手动更改。变量应作为空格分隔列表输入。

/* Read firm plus three numeric measures from the sample CSV, skipping the
   header row. */
data have;
    infile 'C:\dataset (1).csv' dlm=',' dsd missover firstobs=2;
    input firm v1-v3;
run;

/* Variables to analyse: edit this blank-separated list to add or remove columns. */
%let variable_to_consider = v1 v2 v3;
%let variable_to_consider = %cmpres(&variable_to_consider);

/* Row count of HAVE; used below as the highest top_ column number. */
proc sql noprint;
    select count(*) into :obs_count from have;
quit;
%let obs_count = &obs_count;   /* re-assignment trims the padded value */

/* One row per analysis variable; observation values become top_1-top_<n>. */
proc transpose data=have out=want prefix=top_;
    var &variable_to_consider;
run;

/* Sort each row's values. The array lists the columns in descending name
   order and SORTN sorts ascending, so top_1 ends up holding the largest
   value. Only the two largest values and the row total are kept. */
data want;
    set want;
    array ranked(*) top_&obs_count.-top_1;
    call sortn(of ranked[*]);
    total = sum(of ranked[*]);

    keep total top_1 top_2 _name_;
run;

/* Keep the two largest row totals as total_1 and total_2 in a one-row dataset. */
proc summary data=want nway;
    output out=total_top_2_rec(drop=_:)
           idgroup(max(total) out[2](total)=);
run;

/* Read the one-row summary once (its values are retained for every row),
   then derive the 90% and 60% flags for each variable. */
data want;
    if _n_ = 1 then set total_top_2_rec;
    top_2_total = sum(total_1, total_2);

    set want;
    Flag90 = (sum(top_1, top_2) > 0.9 * top_2_total);
    Flag60 = (top_1 > top_2_total * 0.6);

    drop total_1 total_2;
run;

proc print data=want;
run;

编辑2014-04-05：如上所述，我已更新逻辑并修复了问题。以下是更新后的代码。

/* Test input 1: three simulated firms, four measures per row, each drawn
   with rand("uniform"). */
data have1;
    do firm = 1 to 3;
        array measure{4} revenue costs profits vcost;
        do k = 1 to dim(measure);
            measure[k] = rand("uniform");
        end;
        output;
    end;
    drop k;
run;

/* Test input 2: firm plus three numeric measures read from the sample CSV,
   skipping the header row. */
data have2;
    infile 'dataset (1).csv' dlm=',' dsd missover firstobs=2;
    input firm v1-v3;
run;
/*
 * mymacro: concentration check for a list of numeric variables.
 *
 * Parameters
 *   input_dataset        - dataset with one row per observation.
 *   output_dataset       - dataset to create, one row per analysed variable.
 *   variable_to_consider - blank-separated list of numeric variables.
 *
 * Output columns
 *   _name_       name of the analysed variable
 *   top_1/top_2  largest / second-largest value
 *   top_2_total  top_1 + top_2
 *   total        sum over all observations
 *   Flag90       1 when top_2_total > 90% of total, else 0
 *   Flag60       1 when top_1 > 60% of total, else 0
 *
 * With fewer than two observations the transposed data has no top_2 column,
 * so top_2 is missing in the output.
 */
%macro mymacro(input_dataset= ,output_dataset=, variable_to_consider=);

/* Keep the row count private so a caller's OBS_COUNT is never overwritten. */
%local obs_count;

%let variable_to_consider=%cmpres(&variable_to_consider);
proc sql noprint;
  select count(*) into : obs_count from &input_dataset;
quit;
/* Re-assigning strips the leading blanks left by SELECT INTO. */
%let obs_count=&obs_count;

/* One row per analysed variable; observation values become top_1-top_<n>. */
proc transpose data=&input_dataset out=&output_dataset prefix=top_;
    var &variable_to_consider;
run;

data &output_dataset;
    set &output_dataset;
    /* Descending name list + ascending sort => top_1 is the largest value. */
    array top(*) top_&obs_count.-top_1;
    call sortn(of top[*]);
    total=sum(of top[*]);

    top_2_total=sum(top_1, top_2);
    Flag90 = (top_2_total > 0.9 * total);
    Flag60 = (top_1 > total * 0.6);

    keep _name_ top_1 top_2 top_2_total total Flag60 Flag90;
run;
%mend mymacro;

/* Run the check on the simulated data and on the CSV data, then list both
   result tables. */
%mymacro(input_dataset=have1, output_dataset=want1, variable_to_consider=revenue costs profits vcost)
%mymacro(input_dataset=have2, output_dataset=want2, variable_to_consider=v1 v2 v3)

proc print data=want1;
run;
proc print data=want2;
run;

Answer 3

对于分子大于或等于分母的值，最后一步中的flag1和flag2将为正整数，如果分子小于分母，则为零。

/* Four simulated rows with a constant character column and four measures
   drawn with rand("uniform"); the loop counter FIRM is not stored. */
data have;
    do firm = 1 to 4;
        VarName = 'Variable';
        revenue = rand("uniform");
        costs = rand("uniform");
        profits = rand("uniform");
        vcost = rand("uniform");
        output;
    end;
    drop firm;
run;

/* One row per numeric variable: its name goes to VARIABLE and the four
   observation values to Var_1-Var_4. */
proc transpose data=have
               out=transout
               name=Variable
               prefix=Var_;
run;

/* Show the SAS code generated by macros in the log. */
options mprint;

/*
 * calcflag: per-row concentration flags for a transposed table.
 *
 * Parameters
 *   Varlist - comma-separated list of at least two numeric columns, wrapped
 *             in %str() so the commas are not read as macro-argument
 *             separators. (With a single column, SUM would act as a PROC SQL
 *             summary function instead of a row-wise sum.)
 *   in=     - input table holding VARIABLE and the listed columns
 *             (default: transout).
 *   out=    - table to create (default: outtable).
 *
 * Output columns
 *   Variable, Sum_var (row sum), Top_1, Top_2, Top_2_total (Top_1 + Top_2),
 *   flag1 - 1 when Top_2_total >= 90% of Sum_var, else 0
 *   flag2 - 1 when Top_1 >= 60% of Sum_var, else 0
 * Both flags are missing when Sum_var is not positive, because the share of
 * a zero or negative total is undefined. Comparing instead of taking
 * floor(ratio) avoids division by zero and keeps the flags strictly 0/1.
 */
%macro calcflag(Varlist, in=transout, out=outtable);
proc sql;
create table &out as
select Variable,
       sum(&Varlist) as Sum_var,
       largest(1,&Varlist) as Top_1,
       largest(2,&Varlist) as Top_2,
       sum(calculated Top_1, calculated Top_2) as Top_2_total,
       case when calculated Sum_var > 0
            then (calculated Top_2_total >= 0.9 * calculated Sum_var)
            else .
       end as flag1,
       case when calculated Sum_var > 0
            then (calculated Top_1 >= 0.6 * calculated Sum_var)
            else .
       end as flag2
from &in;
quit;
%mend calcflag;

/* Flag the four transposed columns; %str() shields the commas in the list. */
%calcflag(%str(Var_1,Var_2,Var_3,Var_4))

变量检查和摘要

问题/疑问

期望输出

可重复的例子

3 个答案: