我有一个项目,其中经常使用 DATA 步和 SET 语句来合并(堆叠)数据集。
/* Concatenate (stack) have1 and have2 into want with a single SET statement. */
data want;
set have1 have2;
run;
问题是,这些数据集的结构通常相同,但同名字符变量的长度不同,于是会发生数据截断。所以我开始编写一个宏,根据输入数据集动态设置长度:每个字符变量都取其在各数据集中的最大长度。到目前为止,我已经写出了下面这段非常简单的代码:
%macro Formatting;
    /*
     * Widen every character variable in a fixed set of tables to the longest
     * length that variable has in any of them, so that stacking the tables
     * with SET no longer truncates values.
     *
     * Steps:
     *   1. PROC CONTENTS on every table that exists.
     *   2. One SQL query computes the maximum length per character variable
     *      (matched case-insensitively) across all tables.
     *   3. Each table is rewritten exactly once, and only for the character
     *      variables it actually contains, so no variable is ever created in
     *      a table that did not already have it.
     *
     * Takes no parameters; temporary tables are named WORK._FMT_*.
     */
    %local candidates dsList dsCount k ds attrSpec;

    /* Single source of truth for the tables: the same name is used for the
       inspection and for the rewrite.
       NOTE(review): the earlier version inspected historic.Money_Market_2016
       but rewrote Historic.moneymarket2016 - confirm which name is real. */
    %let candidates = engdata.liabilities2 engdata.assets2 engdata.bonds2 engdata.irswaps2;
    %let candidates = &candidates historic.liabilities2016 historic.assets2016;
    %let candidates = &candidates historic.Money_Market_2016;

    /* Keep only tables that exist; rewriting a missing table would create an
       empty one. */
    %let dsList = ;
    %do k = 1 %to %sysfunc(countw(&candidates, %str( )));
        %let ds = %scan(&candidates, &k, %str( ));
        %if %sysfunc(exist(&ds)) %then %let dsList = &dsList &ds;
        %else %put WARNING: Formatting skipped &ds because it does not exist.;
    %end;
    %if %length(&dsList) = 0 %then %return;
    %let dsCount = %sysfunc(countw(&dsList, %str( )));

    /* 1. Variable metadata for every table. */
    %do k = 1 %to &dsCount;
        %let ds = %scan(&dsList, &k, %str( ));
        proc contents data=&ds
                      out=_fmt_c&k(keep=name type length format informat)
                      noprint;
        run;
    %end;

    /* 2. Longest length per character variable (TYPE = 2) over all tables. */
    proc sql noprint;
        create table _fmt_max as
        select uname, max(length) as largest
        from (
            %do k = 1 %to &dsCount;
                %if &k > 1 %then union all;
                select upcase(name) as uname, length
                from _fmt_c&k
                where type = 2
            %end;
        )
        group by uname;
    quit;

    /* 3. Rewrite each table once with a single ATTRIB statement. */
    %do k = 1 %to &dsCount;
        %let ds = %scan(&dsList, &k, %str( ));
        %let attrSpec = ;

        proc sql noprint;
            /* The inner join restricts the list to variables present in this
               table. Format and informat are widened only when they are unset
               or a plain $w., so custom formats are left alone. */
            select catx(' ',
                        c.name,
                        cats('length=$', m.largest),
                        case when c.format in (' ', '$')
                             then cats('format=$', m.largest, '.')
                             else ' ' end,
                        case when c.informat in (' ', '$')
                             then cats('informat=$', m.largest, '.')
                             else ' ' end)
              into :attrSpec separated by ' '
              from _fmt_c&k as c
                   inner join _fmt_max as m
                   on upcase(c.name) = m.uname
             where c.type = 2;
        quit;

        /* Nothing to do when the table has no character variables. */
        %if &sqlobs > 0 %then %do;
            %put NOTE: Formatting &ds using &attrSpec;
            data &ds;
                /* ATTRIB before SET so the widened lengths define the PDV. */
                attrib &attrSpec;
                set &ds;
            run;
        %end;
    %end;

    /* Remove the temporary metadata tables. */
    proc datasets library=work nolist;
        delete _fmt_: ;
    quit;
%mend Formatting;
%Formatting;
有时,我要设置格式的变量在某些数据集中并不存在,这时日志中会出现以下内容:
NOTE: There were 0 observations read from the data set HISTORIC.MONEYMARKET2016.
NOTE: The data set HISTORIC.MONEYMARKET2016 has 0 observations and 11 variables.
NOTE: DATA statement used (Total process time):
real time 0.02 seconds
cpu time 0.01 seconds
但当我查看数据集时,所有内容都还在。我是否会因此丢失什么?有没有办法在这个循环中,当变量不存在时直接跳过它?
答案 0 :(得分:3)
不要逐个重写所有数据集(这会导致大量磁盘活动),而应考虑编写代码来构造 attrib
语句,用它来统一各变量的属性。构造出的语句放在堆叠数据所用的 SET
语句之前,从而强制 PDV(程序数据向量)采用统一后的属性;这样所有读入的数据都符合 PDV 的长度属性,也不会出现警告。
例如,考虑三个数据集
/* Example input 1: character s assigned the 4-character literal 'aaaa';
   numeric y declared with a storage length of 4. */
data one;
s='aaaa';
y=4;
length y 4;
run;
/* Example input 2: s declared as $50 (never assigned), an extra character
   variable t, and numeric y. */
data two;
length s $50;
t = 'for 2';
y = 1.75;
run;
/* Example input 3: s declared as $20 (never assigned) and a numeric z that
   the other two inputs do not have. */
data three;
length s $20;
z = -1;
run;
以属性统一的方式堆叠:
/* Stack the three example data sets into next_big_thing. */
%big_stack_attack (datasets=
one
two
three,
out=next_big_thing
)
堆叠宏只是一个简单的包装器,额外多做一步:先获取用于统一各数据集变量属性的 attrib 语句。
%macro big_stack_attack(datasets=, out=);
    /*
     * Stack several data sets into one.
     *
     *   datasets = names of the input data sets, passed on to %homogenize
     *              and to the SET statement
     *   out      = name of the stacked output data set
     *
     * %homogenize returns attrib statements in the local macro variable
     * named below; they are emitted ahead of SET so that the PDV is defined
     * before any input data set is read.
     */
    %local pdv_attribs;

    %homogenize(datasets=&datasets, result=pdv_attribs)

    data &out;
        &pdv_attribs;
        set &datasets;
    run;
%mend big_stack_attack;
用于构造 attrib 语句的宏会检查各数据集的内容(PROC CONTENTS),并在构造的 attrib 语句中采用每个变量的最长长度:
%macro homogenize (datasets=, result=);
    /*
     * Build attrib statements that give every variable the longest length it
     * has in any of the listed data sets.
     *
     *   datasets = blank-separated data set names; two-level names such as
     *              lib.member are accepted
     *   result   = NAME of a macro variable in the calling scope that receives
     *              the statements, separated by semicolons
     *
     * Variables are matched by name without regard to case, and by type.
     * A name that is character in one data set and numeric in another is not
     * reconciled: it yields one attrib statement per type.
     */
    %local i N;

    %if %length(&result) = 0 %then %do;
        %put ERROR: homogenize needs result= set to the name of a macro variable.;
        %return;
    %end;

    /* Start from an empty result so a stale value is never reused. */
    %let &result = ;

    /* Split the list on blanks only; the default %scan delimiters include the
       period and would cut lib.member in two. */
    %let i = 1;
    %do %while (%length(%scan(&datasets, &i, %str( ))));
        %local data&i;
        %let data&i = %scan(&datasets, &i, %str( ));
        %let i = %eval(&i + 1);
    %end;
    %let N = %eval(&i - 1);

    /* With no inputs the query below would have an empty FROM clause. */
    %if &N = 0 %then %do;
        %put WARNING: homogenize received no data sets, so &result is left empty.;
        %return;
    %end;

    /* Variable metadata for each data set. */
    %do i = 1 %to &N;
        proc contents noprint data=&&data&i out=_contents&i;
        run;
    %end;

    /* One attrib statement per variable and type. Grouping on the upper-cased
       name makes S and s one variable; min(name) picks one existing spelling
       to emit. */
    proc sql noprint;
        select "attrib "
            || trim(min(name)) || " length="
            || case when type=2 then "$" else " " end
            || cats(max(length))
            || case
                 when type=2 then " format=$" || cats(max(length)) || "."
                 else " "
               end
          into :&result separated by ';'
          from (
              %do i = 1 %to &N;
                  %if &i > 1 %then union all;
                  select name, upcase(name) as uname, type, length
                  from _contents&i
              %end;
          )
         group by uname, type
        ;
    quit;

    /* Remove the temporary metadata tables. */
    proc datasets library=work nolist;
        delete %do i = 1 %to &N; _contents&i %end; ;
    quit;
%mend homogenize;
同名变量类型不同的情况需要额外的编码,并且需要先明确需求(例如:是应尝试把字符变量转换为数值类型,还是应把数值变量转换为字符类型)。