我有一个给定的数据集:
Policy_Number,var1,var2,var3,Exposure
1,B,H,J,191
2,B,F,Unknown,174
3,C,Unknown,I,153
4,B,G,L,192
5,Unknown,E,Unknown,184
6,D,E,K,113
7,C,Unknown,I,140
8,A,H,I,133
9,C,F,I,194
10,Unknown,G,Unknown,105
11,B,H,L,172
12,A,Unknown,I,198
13,D,E,K,155
14,Unknown,G,K,177
15,B,H,Unknown,100
16,D,Unknown,J,176
17,B,E,I,112
18,Unknown,E,J,192
19,C,Unknown,K,146
20,C,G,Unknown,187
我想通过使用PROC Means或Summary将给定数据转换为以下形式:
Variables Levels Tot_Exposures
Var1 A 331
Var1 B 941
Var1 C ...
Var1 D ...
Var1 Unknown ...
Var2 E ...
Var2 F ...
Var2 G ...
Var2 H ...
Var2 Unknown ...
Var3 I ...
Var3 J ...
Var3 K ...
Var3 L ...
Var3 Unknown ...
其中 Tot_Exposures 是每个变量(Variables)的每个水平(Levels)对应的 Exposure 合计,我想要的就是这样一张汇总表。请帮帮我。
编辑:我已经尝试过 PROC MEANS 方法,但我希望能一步完成。目前我是分 3 步完成的,得到的输出类似图片所示。代码如下:
/* Read the policy-level CSV file 'complex.csv', skipping its header row.     */
/* Policy_Number and Var1-Var3 are stored as 10-byte character variables.     */
/* Exposure uses the full 8-byte numeric length: a 3-byte numeric can only    */
/* represent integers exactly up to 8,192 on Windows/UNIX hosts and cannot    */
/* hold most fractional values, so larger or fractional exposures would be    */
/* silently rounded before they are summed.                                   */
data try2;
infile 'complex.csv' dsd dlm = ',' FIRSTOBS = 2;
Length Policy_Number Var1 $ 10 Var2 $ 10 Var3 $ 10 Exposure 8;
input Policy_Number $ Var1 $ Var2 $ Var3 $ Exposure;
run;
/* Reorder TRY2 in place by ascending Exposure.                               */
/* NOTE(review): the PROC MEANS steps below group with CLASS rather than BY,  */
/* so they presumably do not rely on this ordering - confirm before removing. */
proc sort data=try2 out=try2;
  by Exposure;
run;
/* Total Exposure for each level of Var1.                                     */
/* The SUM option on the PROC statement only controls the printed report.     */
/* The OUTPUT statement needs its own statistic request; without one the      */
/* data set holds the default N/MIN/MAX/MEAN/STD rows and no sum at all.      */
/* WANT gets one row per Var1 level plus the overall (_TYPE_=0) row, with     */
/* the total stored in Tot_Exposures.                                         */
proc means data = try2 SUM;
class Var1;
var exposure;
output out = want sum = Tot_Exposures;
title ' Var1';
run;
/* Total Exposure for each level of Var2.                                     */
/* The SUM option on the PROC statement only controls the printed report.     */
/* The OUTPUT statement needs its own statistic request; without one the      */
/* data set holds the default N/MIN/MAX/MEAN/STD rows and no sum at all.      */
/* WANT2 gets one row per Var2 level plus the overall (_TYPE_=0) row, with    */
/* the total stored in Tot_Exposures.                                         */
proc means data = try2 SUM;
class Var2;
var exposure;
output out = want2 sum = Tot_Exposures;
title 'Var2';
run;
/* Total Exposure for each level of Var3.                                     */
/* The SUM option on the PROC statement only controls the printed report.     */
/* The OUTPUT statement needs its own statistic request; without one the      */
/* data set holds the default N/MIN/MAX/MEAN/STD rows and no sum at all.      */
/* WANT3 gets one row per Var3 level plus the overall (_TYPE_=0) row, with    */
/* the total stored in Tot_Exposures.                                         */
proc means data = try2 SUM;
class Var3;
var exposure;
output out = want3 sum = Tot_Exposures;
title 'Var3';
run;
答案 0 :(得分:0)
很抱歉我把旧答案搬到了新帖子里,不过就你对汇总(PROC SUMMARY)的需求而言,我的旧方法同样可以帮你算出结果。
只需将最后一个SQL部分(我刚刚添加的那一段)换成下面的代码即可:
/* Sum EXPOSURE for every (VARIABLENAME, VARIABLEVALUE) pair in GET_MAX and   */
/* store the result in table OUT.                                             */
/* The aggregate is given an explicit name; without an alias PROC SQL assigns */
/* an auto-generated column name that is awkward to reference later.          */
/* NOTE(review): GET_MAX is not created in this snippet - presumably it is    */
/* the long-format table built by the earlier answer; confirm.                */
proc sql;
create table OUT as
select VARIABLENAME
, VARIABLEVALUE
, sum(EXPOSURE) as TOT_EXPOSURES
from
GET_MAX
group by VARIABLENAME, VARIABLEVALUE
;quit;
答案 1 :(得分:0)
您可以在一个PROC SUMMARY步骤中汇总所有三个变量,但其输出与您指定的格式并不完全一致。不过,只要再用一个数据步(DATA step)对PROC SUMMARY的输出稍作处理,就能得到所需的形式。我在CLASS语句中使用了MLF选项,把所有CLASS变量都“转换”为字符型。你的数据里没有数值型的类变量,但这个选项很有用,因为类变量既可能是数值型,也可能是字符型。
/* Build the sample data set EXP from the in-stream, comma-separated records. */
/* FIRSTOBS=2 skips the header record; var1-var3 are read as character.       */
data exp;
  infile datalines dsd firstobs=2;
  input Policy_Number var1 $ var2 $ var3 $ Exposure;
datalines;
Policy_Number,var1,var2,var3,Exposure
1,B,H,J,191
2,B,F,Unknown,174
3,C,Unknown,I,153
4,B,G,L,192
5,Unknown,E,Unknown,184
6,D,E,K,113
7,C,Unknown,I,140
8,A,H,I,133
9,C,F,I,194
10,Unknown,G,Unknown,105
11,B,H,L,172
12,A,Unknown,I,198
13,D,E,K,155
14,Unknown,G,K,177
15,B,H,Unknown,100
16,D,Unknown,J,176
17,B,E,I,112
18,Unknown,E,J,192
19,C,Unknown,K,146
20,C,G,Unknown,187
;;;;
run;
/* One-way totals of Exposure for every class variable whose name starts      */
/* with VAR, written to TEST as var1-var3, _TYPE_ and TotExposures.           */
/*   WAYS 1        - only single-variable groupings, no crossings             */
/*   CHARTYPE      - _TYPE_ is a character string of 0/1 flags                */
/*   DESCENDTYPES  - rows are ordered by descending _TYPE_                    */
/*   MLF           - class variables are carried into TEST as character       */
/* Exposure is summed as an analysis variable instead of being used on a      */
/* FREQ statement: FREQ truncates non-integer values and skips observations   */
/* whose value is missing or below 1, which would understate the totals.      */
/* _FREQ_ (now a plain row count) is dropped so TEST keeps the same columns.  */
proc summary data=exp descendtypes chartype;
  class var: / mlf;
  ways 1;
  var Exposure;
  output out=test(drop=_freq_) sum=TotExposures;
run;
/* Reshape TEST into one row per (variable, level): VARIABLE holds the name   */
/* of the class variable the row belongs to and LEVELS holds its value.       */
data want;
  length variable $32 levels $8;
  set test;
  array classvar[*] var1-var3;

  /* The position of the '1' flag inside _TYPE_ is used as the index of the   */
  /* class variable that this summary row describes.                          */
  pos = indexc(_type_, '1');
  variable = vname(classvar[pos]);
  levels = classvar[pos];

  drop var1-var3 pos _type_;
run;
答案 2 :(得分:0)
这种方法可以轻松处理350个变量;4500万条记录会花一些时间,但PROC SUMMARY同样能够应付。变量LEVELS的长度需要定义为大于或等于所有类变量中最长的格式化值的长度。至于如何确定这个长度,你可以另开一个问题来问。
/* Build the sample data set EXP from the in-stream, comma-separated records  */
/* (FIRSTOBS=2 skips the header record) and add three extra numeric columns.  */
/* NOTE(review): the arbitraryname* columns presumably exist only to show     */
/* that arbitrarily named numeric class variables are handled - confirm.      */
data exp;
  infile datalines dsd firstobs=2;
  input Policy_Number var1 $ var2 $ var3 $ Exposure;

  /* Collating-sequence position of the first character of var1. */
  arbitraryname243 = rank(first(var1));
  /* Random draw from RANTBL with seed 123 and first probability .4. */
  arbitraryname4 = rantbl(123, .4);
  /* Collating-sequence position of the first character of var3. */
  arbitraryname36 = rank(first(var3));
datalines;
Policy_Number,var1,var2,var3,Exposure
1,B,H,J,191
2,B,F,Unknown,174
3,C,Unknown,I,153
4,B,G,L,192
5,Unknown,E,Unknown,184
6,D,E,K,113
7,C,Unknown,I,140
8,A,H,I,133
9,C,F,I,194
10,Unknown,G,Unknown,105
11,B,H,L,172
12,A,Unknown,I,198
13,D,E,K,155
14,Unknown,G,K,177
15,B,H,Unknown,100
16,D,Unknown,J,176
17,B,E,I,112
18,Unknown,E,J,192
19,C,Unknown,K,146
20,C,G,Unknown,187
;;;;
run;
/* Collect the names of every EXP variable except POLICY_NUMBER and EXPOSURE  */
/* into the macro variable CLASSVARS, separated by blanks.                    */
/* OBS=0 means no data rows are read; VARLIST gets one row per variable with  */
/* the variable name in _NAME_.                                               */
proc transpose data=exp(drop=policy_number exposure obs=0) out=varlist;
  var _all_;
run;

/* NLITERAL makes each name safe to use as a variable reference. */
proc sql noprint;
  select nliteral(_name_)
    into :classvars separated by ' '
    from varlist;
quit;

/* Echo the generated list to the log. */
%put NOTE: &=classvars;
/* One-way totals of Exposure for every variable listed in the CLASSVARS      */
/* macro variable, written to TEST as the class variables, _TYPE_ and         */
/* TotExposures.                                                              */
/*   WAYS 1        - only single-variable groupings, no crossings             */
/*   CHARTYPE      - _TYPE_ is a character string of 0/1 flags                */
/*   DESCENDTYPES  - rows are ordered by descending _TYPE_                    */
/*   MLF           - class variables are carried into TEST as character       */
/* Exposure is summed as an analysis variable instead of being used on a      */
/* FREQ statement: FREQ truncates non-integer values and skips observations   */
/* whose value is missing or below 1, which would understate the totals.      */
/* _FREQ_ (now a plain row count) is dropped so TEST keeps the same columns.  */
proc summary data=exp descendtypes chartype;
  class &classvars / mlf;
  ways 1;
  var Exposure;
  output out=test(drop=_freq_) sum=TotExposures;
run;
/* Reshape TEST into one row per (variable, level), keeping only VARIABLE,    */
/* LEVELS and TOTEXPOSURES.                                                   */
data want(keep=Variable levels totexposures);
  length variable $32 levels $8;
  set test;
  array classvar[*] &classvars;

  /* The position of the '1' flag inside _TYPE_ is used as the index of the   */
  /* class variable that this summary row describes.                          */
  pos = indexc(_type_, '1');
  variable = vname(classvar[pos]);
  levels = classvar[pos];
run;