我想对数据集中的一组变量(收入、成本、利润和vcosts)做一个简单检查:找出每个变量的最大值和第二大值,检查这两个值之和是否大于该变量总和的90%,如果是,则标记该变量。我还要检查最大值是否不超过该变量总和的60%。
我从“Macro that outputs table with testing results of SAS table”这个问题中获得了一些帮助,但现在我想解决一个更基本的问题。这看起来并不难,但我想不出最后该如何生成这张基本表。
我知道所有变量名称。
以下是我创建的示例数据集:https://www.dropbox.com/s/x575w5d551uu47p/dataset%20%281%29.csv?dl=0
我想把这个基本表:
转换成另一个这样的表:
/* Build a small demo dataset: three firms, each with four measures to assess */
data have;
    /* Declare the columns up front so their order stays firm, revenue, costs, profits, vcost */
    length firm revenue costs profits vcost 8;
    array measure{*} revenue costs profits vcost;
    do firm = 1 to 3;
        /* One uniform draw per measure, taken left to right */
        do _k = 1 to dim(measure);
            measure[_k] = rand("uniform");
        end;
        output;
    end;
    drop _k;
run;
答案 0 :(得分:1)
这里的难点是为每个变量提取最大的2个值。这在大多数SQL实现中都很简单,但我认为SAS的proc sql并不支持select top n语法。
我可以想到几种可行的方法:
针对每个感兴趣的变量,将数据集按该变量降序排序,从前2个观测中取出值,转置后再把所有结果拼接在一起 - 由于需要多次排序,这种方法效率非常低,而且也并不比其他方法简单多少。
写一个(相当复杂的)数据步骤来提取每个变量的前2个值。
让proc univariate为您提取最大的几个值,然后将输出数据集整理成所需的格式。
数据步骤方法
/*
 * Single pass over HAVE that tracks the largest and second-largest value of
 * each analysis variable and, at end of input, writes one row per variable:
 *   varname      name of the source variable
 *   top_1        largest value seen
 *   top_2        second-largest value seen (equals top_1 when the maximum is tied)
 *   top_2_total  sum of top_1 and top_2
 */
data top2;
    array v{4} revenue costs profits vcost;
    /* _temporary_ elements start out missing and are retained across rows.
       Starting from missing instead of 0 lets negative values be captured. */
    array best{4}   _temporary_;
    array second{4} _temporary_;
    set have end = eof;
    do i = 1 to 4;
        /* Skip missing inputs so they can never displace a real value */
        if missing(v[i]) then continue;
        if v[i] > best[i] then do;
            /* New maximum: the old maximum becomes the runner-up */
            second[i] = best[i];
            best[i] = v[i];
        end;
        /* Also reached when v[i] ties the maximum, so a tie fills the runner-up */
        else if v[i] > second[i] then second[i] = v[i];
    end;
    length varname $8;
    if eof then do i = 1 to 4;
        varname = vname(v[i]);
        top_1 = best[i];
        top_2 = second[i];
        /* sum() ignores a missing top_2 (fewer than two usable rows) */
        top_2_total = sum(top_1, top_2);
        output;
    end;
    keep varname top_:;
run;
proc univariate方法
/*
 * PROC UNIVARIATE approach: capture the ExtremeObs ODS table, then reduce it
 * to one row per variable holding top_1, top_2 and top_2_total.
 */
ods _all_ close;
ods output extremeobs = extremeobs(keep = varname high);
proc univariate data = have(drop = firm);
run;
ods listing;

data top2_b;
    set extremeobs;
    by varname notsorted;
    /* top_2 carries the previous row's HIGH value within the current variable */
    retain top_2;
    if first.varname then call missing(top_2);
    /* The HIGH values of a variable are listed in ascending order, so the
       last row is the maximum and the row before it is the runner-up.
       Keying on last.varname instead of fixed row numbers 4 and 5 keeps the
       step working when a variable has a different number of rows. */
    if last.varname then do;
        top_1 = high;
        /* Missing when the variable has only one row, since top_2 is then missing */
        top_2_total = top_1 + top_2;
        output;
    end;
    top_2 = high;
    drop high;
run;
一旦你得到了这个,你就可以将它与proc means / proc summary中现有的简单表合并,并计算任何其他感兴趣的测量值。
答案 1 :(得分:1)
根据您对上一个答案的评论,看起来top_2_total是2个最大总值之和。为此,您需要多写几个步骤。我使用proc transpose和data步来得到上一个答案中已经实现的内容,然后用PROC SUMMARY取得最大的2个总值,并再次使用该数据集生成最终结果。如果有帮助,请告诉我。
/* Demo input: three firms, each with four uniform random measures */
data have;
    length firm revenue costs profits vcost 8;
    array measure{*} revenue costs profits vcost;
    do firm = 1 to 3;
        do _k = 1 to dim(measure);
            measure[_k] = rand("uniform");
        end;
        output;
    end;
    drop _k;
run;

/* One row per measure; the three firms become columns top_1, top_2, top_3 */
proc transpose data=have out=want prefix=top_;
    var revenue costs profits vcost;
run;

/* Rank the values within each row and add the row total */
data want;
    set want;
    /* The array lists the columns high-to-low, so the ascending sort leaves
       the smallest value in top_3 and the largest in top_1 */
    array top(*) top_3 top_2 top_1;
    call sortn(of top[*]);
    total = sum(of top[*]);
run;

/* Pick the two largest row totals; they arrive as total_1 and total_2 */
proc summary data=want nway;
    output out=total_top_2_rec(drop=_:)
           idgroup(max(total) out[2](total)=);
run;

/* Attach the sum of the two largest totals to every row and derive the flags */
data want;
    /* Read the one-row summary once; its values are retained for all rows */
    if _n_ = 1 then set total_top_2_rec;
    top_2_total = sum(total_1, total_2);
    set want;
    /* Both flags are measured against top_2_total, i.e. the sum of the two
       largest per-variable totals, not against the row's own total */
    Flag90 = (sum(top_1, top_2) > 0.9 * top_2_total);
    Flag60 = (top_1 > top_2_total * 0.6);
    drop total_1 total_2;
run;

proc print data=want;
run;
编辑:我在PROC TRANSPOSE之前添加了一个逻辑,您可以在其中添加变量以供计算考虑,其余部分由代码完成。在此之后,代码执行者不需要手动更改。变量应作为空格分隔列表输入。
/* Read the CSV: comma-delimited, header row skipped via firstobs=2 */
data have;
    infile 'C:\dataset (1).csv' dlm=',' dsd missover firstobs=2;
    input firm v1 v2 v3;
run;

/* Add or remove column names here (space-separated) to choose the variables analysed */
%let variable_to_consider = v1 v2 v3;
/* Collapse any repeated blanks in the list */
%let variable_to_consider=%cmpres(&variable_to_consider);

/* Row count of HAVE = number of top_ columns the transpose will create */
proc sql noprint;
    select count(*) into :obs_count
    from have;
quit;
/* Re-assigning strips the leading blanks that INTO leaves in the value */
%let obs_count=&obs_count;

/* One row per analysed variable; observations become top_1 ... top_N */
proc transpose data=have out=want prefix=top_;
    var &variable_to_consider;
run;

/* Rank the values within each row and add the row total */
data want;
    set want;
    /* The range is listed from top_N down to top_1, so the ascending sort
       leaves the largest value in top_1 */
    array top(*) top_&obs_count.-top_1;
    call sortn(of top[*]);
    total = sum(of top[*]);
    keep total top_1 top_2 _name_;
run;

/* Pick the two largest row totals; they arrive as total_1 and total_2 */
proc summary data=want nway;
    output out=total_top_2_rec(drop=_:)
           idgroup(max(total) out[2](total)=);
run;

/* Attach the sum of the two largest totals to every row and derive the flags */
data want;
    /* Read the one-row summary once; its values are retained for all rows */
    if _n_ = 1 then set total_top_2_rec;
    top_2_total = sum(total_1, total_2);
    set want;
    /* Both flags are measured against top_2_total, i.e. the sum of the two
       largest per-variable totals, not against the row's own total */
    Flag90 = (sum(top_1, top_2) > 0.9 * top_2_total);
    Flag60 = (top_1 > top_2_total * 0.6);
    drop total_1 total_2;
run;

proc print data=want;
run;
编辑2014-04-05:如上所述,我已更新逻辑并修复了问题。以下是更新后的代码。
/* Demo input 1: three firms, each with four uniform random measures */
data have1;
    length firm revenue costs profits vcost 8;
    array measure{*} revenue costs profits vcost;
    do firm = 1 to 3;
        do _k = 1 to dim(measure);
            measure[_k] = rand("uniform");
        end;
        output;
    end;
    drop _k;
run;

/* Demo input 2: the CSV sample, comma-delimited, header row skipped */
data have2;
    infile 'dataset (1).csv' dlm=',' dsd missover firstobs=2;
    input firm v1 v2 v3;
run;
/*
 * mymacro: for each listed variable, find its two largest values and flag
 * concentration against the variable's own total.
 *
 * Parameters
 *   input_dataset=         dataset with one row per observation
 *   output_dataset=        dataset to create, one row per analysed variable
 *   variable_to_consider=  space-separated list of numeric variables to analyse
 *
 * Output columns
 *   _name_       name of the analysed variable
 *   top_1        largest value
 *   top_2        second-largest value
 *   total        sum of all values of the variable
 *   top_2_total  top_1 + top_2
 *   Flag90       1 when top_2_total > 90% of total, else 0
 *   Flag60       1 when top_1 > 60% of total, else 0
 */
%macro mymacro(input_dataset= ,output_dataset=, variable_to_consider=);
    /* Keep the row count private to this macro so a macro variable of the
       same name in the caller's scope is not overwritten */
    %local obs_count;
    %let variable_to_consider=%cmpres(&variable_to_consider);

    /* Row count of the input = number of top_ columns the transpose creates */
    proc sql noprint;
        select count(*) into :obs_count from &input_dataset;
    quit;
    /* Re-assigning strips the leading blanks that INTO leaves in the value */
    %let obs_count=&obs_count;

    /* One row per analysed variable; observations become top_1 ... top_N */
    proc transpose data=&input_dataset out=&output_dataset prefix=top_;
        var &variable_to_consider;
    run;

    data &output_dataset;
        set &output_dataset;
        /* The range is listed from top_N down to top_1, so the ascending
           sort leaves the largest value in top_1 */
        array top(*) top_&obs_count.-top_1;
        call sortn(of top[*]);
        total=sum(of top[*]);
        top_2_total=sum(top_1, top_2);
        if top_2_total > 0.9 * total then Flag90=1; else Flag90=0;
        if top_1 > total * 0.6 then Flag60=1; else Flag60=0;
        keep _name_ top_1 top_2 total top_2_total Flag60 Flag90;
    run;
%mend mymacro;

%mymacro(input_dataset=have1, output_dataset=want1 ,variable_to_consider=revenue costs profits vcost)
%mymacro(input_dataset=have2, output_dataset=want2 ,variable_to_consider=v1 v2 v3 )

proc print data=want1;run;
proc print data=want2;run;
答案 2 :(得分:1)
对于分子大于或等于分母的值,最后一步中的flag1和flag2将为正整数,如果分子小于分母,则为零。
/* Demo input: four firms with four uniform random measures; firm is not kept */
data have(drop=firm);
    /* Constant character column, assigned once and carried on every output row */
    VarName = 'Variable';
    do firm = 1 to 4;
        revenue = rand("uniform");
        costs   = rand("uniform");
        profits = rand("uniform");
        vcost   = rand("uniform");
        output;
    end;
run;

/* No VAR statement is given; the output has the source variable name in
   column Variable and the observations in columns prefixed Var_ */
proc transpose data=have
               out=transout
               name=Variable
               prefix=Var_;
run;

/* Echo the SAS code generated by macros to the log */
options Mprint;

/*
 * calcflag: build OUTTABLE from TRANSOUT with, per row, the sum of the listed
 * columns, the two largest values, their sum, and two flags.
 *   Varlist  comma-separated column list (quote it with %str when calling)
 * flag1 = floor(Top_2_total / (0.9 * Sum_var)) and
 * flag2 = floor(Top_1 / (0.6 * Sum_var)), so each flag is 0 when the
 * numerator is below the threshold and a positive integer otherwise.
 */
%macro calcflag(Varlist);
    proc sql;
        create table outtable as
        select Variable,
               sum(&Varlist)                            as Sum_var,
               largest(1, &Varlist)                     as Top_1,
               largest(2, &Varlist)                     as Top_2,
               sum(calculated Top_1, calculated Top_2)  as Top_2_total,
               floor(calculated Top_2_total / (calculated Sum_var * 0.9)) as flag1,
               floor(calculated Top_1 / (calculated Sum_var * 0.6))       as flag2
        from transout;
    quit;
%mend;

%calcflag(%str(Var_1,Var_2,Var_3,Var_4));