我想计算按其他列分组的累计最大值。
假设我有这些数据:
/* Sample input: a character grouping column and a numeric value column. */
data have;
    infile datalines;
    input grp $ number;
    datalines;
a 3
b 4
a 5
b 2
a 1
b 8
;
我想要的输出是:
/* Desired result: for every input row, the largest NUMBER seen so far
   within the same GRP, in the original row order. */
data want;
    input grp $ cummax;
    /* DATALINES is required before in-stream data; without it the rows
       below are parsed as statements and the step fails. */
    datalines;
a 3
b 4
a 5
b 4
a 5
b 8
;
我的实际情况将涉及多个分组列+过滤器,理想情况下,此累积最大值将同时在多个列上计算。
我主要关心的是计算效率,因为我要在数亿到十亿行的表上运行它。PROC SQL 或原生 SAS 的方案都欢迎。
如有必要,可以打乱行的顺序。
系统信息
/* Write the installed SAS products and their version information to the log. */
proc product_status;
run;
For Base SAS Software ... Custom version information: 9.3_M2 Image version information: 9.03.01M2P080112 For SAS/STAT ... Custom version information: 12.1 Image version information: 9.03.01M0P081512 For SAS/GRAPH ... Custom version information: 9.3_M2 For SAS/CONNECT ... Custom version information: 9.3_M2 For SAS OLAP Server ... Custom version information: 9.3_M1 For SAS Enterprise Miner ... Custom version information: 12.1 Image version information: 9.03.01M0P081512 For SAS Integration Technologies ... Custom version information: 9.3_M2 For SAS/ACCESS Interface to Oracle ... Custom version information: 9.3_M1 For SAS/ACCESS Interface to PC Files ... Custom version information: 9.3_M2
/* Write the licensed products and their expiration dates to the log. */
proc setinit;
run;
Product expiration dates: ---Base SAS Software 31JUL2018 ---SAS/STAT 31JUL2018 ---SAS/GRAPH 31JUL2018 ---SAS/CONNECT 31JUL2018 ---SAS OLAP Server 31JUL2018 ---SAS Enterprise Miner 31JUL2018 ---MDDB Server common products 31JUL2018 ---SAS Integration Technologies 31JUL2018 ---SAS Enterprise Miner Server 31JUL2018 ---SAS Enterprise Miner Client 31JUL2018 ---Unused OLAP Slot 31JUL2018 ---SAS Enterprise Guide 31JUL2018 ---SAS/ACCESS Interface to Oracle 31JUL2018 ---SAS/ACCESS Interface to PC Files 31JUL2018 ---SAS Metadata Bridges for Informatica 31JUL2018 ---SAS Metadata Bridges for Microsoft SQL Server 31JUL2018 ---SAS Metadata Bridge for Oracle 31JUL2018 ---SAS Workspace Server for Local Access 31JUL2018 ---SAS Workspace Server for Enterprise Access 31JUL2018 ---SAS Table Server 31JUL2018 ---DataFlux Trans DB Driver 31JUL2018 ---SAS Framework Data Server 31JUL2018 ---SAS Add-in for Microsoft Excel 31JUL2018 ---SAS Add-in for Microsoft Outlook 31JUL2018 ---SAS Add-in for Microsoft PowerPoint 31JUL2018 ---SAS Add-in for Microsoft Word 31JUL2018
答案 0 :(得分:2)
/* Order the rows by group so BY-group processing can be used below.
   The sort rewrites HAVE in place. */
proc sort data=have;
    by grp;
run;

/* Running maximum of NUMBER within each GRP.
   MAX is retained across rows and restarted on the first row of a group. */
data want;
    set have;
    by grp;
    retain max;
    if first.grp then max = number;
    else max = max(number, max);
run;
使用不带排序的哈希
/* Cumulative maximum of NUMBER per GRP in a single pass, without sorting.
   A hash object keyed on GRP holds the running maximum of each group in
   its data item VALUE. The output keeps GRP and MAX in the input row order. */
data want;
    /* Host variable for the hash data item; declared explicitly so the
       data symbol exists before the hash is defined. */
    length value 8;
    if _n_=1 then do;
        declare hash h();
        h.definekey('grp');
        h.definedata('value');
        h.definedone();
    end;
    set have;
    /* FIND loads the stored running maximum of this group into VALUE;
       a non-zero return code means the group has not been seen yet. */
    if h.find() ne 0 then max = number;
    else max = max(number, value);
    /* Store the running maximum, not the current row value: storing NUMBER
       would forget the maximum as soon as a smaller value follows it. */
    value = max;
    h.replace();
    drop value number;
run;
答案 1 :(得分:2)
使用 HASH 对象存储每个变量和分组组合的最大值。这样只需对数据集进行单次遍历,并且代码可以随分组数量和变量数量的增加而扩展。
这不需要在大型数据集上成本高昂的排序。
下面先生成测试数据,然后用数据步骤(DATA step)配合 HASH 对象计算累积最大值:
/* Build 1,000,000 test rows: five one-character group keys drawn from the
   letters A-E and five values computed as 10 times a RANNOR draw. */
data example;
    format grp1-grp5 $1.;
    array grp[5];
    array val[5];
    do _row = 1 to 1000000;
        /* group keys are drawn before the values on every row */
        do _k = 1 to dim(grp);
            _pick = ceil(5 * ranuni(1));
            grp[_k] = substr("ABCDE", _pick, 1);
        end;
        do _k = 1 to dim(val);
            val[_k] = rannor(1) * 10;
        end;
        output;
    end;
    /* only the grp and val columns are written out */
    drop _row _k _pick;
run;
/* Single-pass cumulative maximum of val1-val5 within each grp1-grp5
   combination, using a hash object instead of a sort.
   Output: every column of EXAMPLE plus max1-max5 and the row number N. */
data want;
    set example;
    array val[5];
    /* The array is named cmax (its elements are still max1-max5) so that it
       does not share its name with the MAX function. */
    array cmax[5] max1-max5;
    if _n_ = 1 then do;
        declare hash mx();
        rc = mx.defineKey('grp1','grp2','grp3','grp4','grp5');
        rc = mx.defineData('max1','max2','max3','max4','max5');
        rc = mx.defineDone();
    end;
    /* Load the running maxima stored for this key combination, if any */
    rc = mx.find();
    /* No max for this combination yet -- start from the current row */
    if rc then do i = 1 to 5;
        cmax[i] = val[i];
    end;
    /* Raise every running maximum that the current row exceeds */
    do i = 1 to 5;
        if val[i] > cmax[i] then cmax[i] = val[i];
    end;
    /* Store the updated maxima back under the same key */
    rc = mx.replace();
    drop rc i;
    n = _n_; /* original row number, kept for testing */
run;
使用测试变量 n,我们可以按组排序并保持原始顺序,以验证结果是否正确。(提示:确实正确。)
答案 2 :(得分:1)
以下方法可行。如果您想保留原始顺序,请添加一个行计数器,并在最后按该计数器重新排序:
{'002_S_0559': array([ 0., 0., 0., ..., 0., 0., 0.],dtype=float32)}
{'002_S_1070': array([ 0., 0., 0., ..., 0., 0., 0.], dtype=float32)}
{'023_S_0604': array([ 0., 0., 0., ..., 0., 0., 0.], dtype=float32)}
答案 3 :(得分:1)
我构建了一个围绕@DomPazz解决方案的宏功能,可以选择要分组的列,要计算的列以及要删除或保留的列。
我认为包含的例子很简单。
我在底部附上了 cummax 中用到的几个简短的辅助宏函数。
*------------------------------------------------------------;
* CUMMAX ;
* Compute a cumulative max on 1 or several variables grouped ;
* by one or several variables; ;
*------------------------------------------------------------;
/* EXAMPLE:
data have;
format grp1-grp2 $1.;
array grp[2];
array val[3];
do rows=1 to 20;
do i=1 to 2;
r = ceil(ranuni(1)*2);
grp[i] = substr("AB",r,1);
end;
do j=1 to 3;
val[j] = 10*rannor(1);
end;
output;
end;
keep grp: val:;
run;
%cummax(have,grp=grp1 grp2,val=val1 val2,out= want1)
%cummax(have,grp=grp1,val=val1,drop=grp2 val3,out= want2)
%cummax(have,grp=grp1,val=val1,keep= val2,out= want3)
*/
%macro cummax
    (data   /* source table */
    ,grp=   /* variables to group on */
    ,val=   /* variables to compute on */
    ,keep=  /* variables to keep additionally to grp and computed columns, don't use with drop */
    ,drop=  /* variables to drop, don't use with keep */
    ,out=   /* output table */
    );
    /* Computes, in one pass and without sorting, the cumulative maximum of
       every variable in VAL within each combination of the GRP variables.
       Each result column is named <variable>_cmax. A hash object keyed on
       the GRP variables holds the running maxima.
       All working variables and arrays carry the _cmx_ prefix so they
       neither overwrite nor drop input columns named rc, i, val or max,
       and so no array shares its name with the MAX function. */

    /* default output: overwrite the source table */
    %if not %length(&out) %then %let out = &data;

    /* rework keep and drop into data set options for the output table */
    %local n_val max_val;
    %let n_val   = %list_length(&val);
    %let max_val = %list_fix(&val,suffix=_cmax);
    %if %length(&keep) %then %let keep = (keep= &keep &grp &max_val );
    %if %length(&drop) %then %let drop = (drop= &drop);

    /* data step */
    data &out&keep&drop;
        set &data;
        array _cmx_val[&n_val] &val;
        array _cmx_max[&n_val] &max_val;
        if _n_ = 1 then do;
            declare hash mx();
            _cmx_rc = mx.defineKey(%list_quote_comma(&grp));
            _cmx_rc = mx.defineData(%list_quote_comma(&max_val));
            _cmx_rc = mx.defineDone();
        end;
        /* Load the running maxima stored for this key combination, if any */
        _cmx_rc = mx.find();
        /* No max for this combination yet -- start from the current row */
        if _cmx_rc then do _cmx_i = 1 to &n_val;
            _cmx_max[_cmx_i] = _cmx_val[_cmx_i];
        end;
        /* Otherwise raise every running maximum that the current row exceeds */
        else do _cmx_i = 1 to &n_val;
            if _cmx_val[_cmx_i] > _cmx_max[_cmx_i] then
                _cmx_max[_cmx_i] = _cmx_val[_cmx_i];
        end;
        /* Store the updated maxima back under the same key */
        _cmx_rc = mx.replace();
        drop _cmx_rc _cmx_i;
    run;
%mend;
*---------------------------------------------------------------;
* LIST_LENGTH ;
* Length of space separated list ;
*---------------------------------------------------------------;
/* EXAMPLES :
%put %list_length(item1 item2 item3);
*/
%macro list_length
    (data /* space separated list */
    );
    /* Returns the number of blank-separated items in DATA.
       An empty list returns 0; without this guard COUNTW would be called
       with a missing first argument and %sysfunc would raise an error. */
    %if %length(&data) %then %sysfunc(countw(&data,%str( )));
    %else 0;
%mend;
*---------------------------------------------------------------;
* LIST_QUOTE_COMMA ;
* create comma separated list with quoted items, from ;
* unquoted space separated list. ;
*---------------------------------------------------------------;
/* EXAMPLE
%put %list_quote_comma(a b c);
*/
%macro list_quote_comma
(data /* space separated list to quote */
);
/* Emits the items of DATA as a comma separated list of single-quoted
   strings. TRANWRD replaces every blank in DATA with a closing quote, a
   comma and an opening quote; the two outer %str pieces supply the very
   first and the very last quote. %qsysfunc and %str keep the quotes and
   commas masked while the text is assembled, and %unquote removes that
   masking from the final result.
   NOTE(review): each blank is replaced individually, so this presumably
   expects items separated by exactly one blank - confirm against callers. */
%unquote(%str(%')%qsysfunc(tranwrd(&data,%str( ),%str(%',%')))%str(%'))
%mend;
*---------------------------------------------------------------;
* LIST_FIX ;
* Add prefix and/or suffix to items of space separated list ;
*---------------------------------------------------------------;
/* EXAMPLES :
%put %list_fix(item1 item2 item3,pref_,_suf);
%put %list_fix(item1 item2 item3,pref_);
%put %list_fix(item1 item2 item3,suffix=_suf);
*/
%macro list_fix
    (data   /* space separated list */
    ,prefix /* text put in front of every item (optional) */
    ,suffix /* text put behind every item (optional) */
    );
    /* Returns DATA with PREFIX and/or SUFFIX attached to every item.
       The loop index is declared %local so that a macro variable I of the
       caller (or a global one) is not overwritten.
       An empty list returns nothing instead of failing inside COUNTW. */
    %local output i;
    %if %length(&data) %then %do;
        %do i = 1 %to %sysfunc(countw(&data,%str( )));
            %let output= &output &prefix.%scan(&data,&i,%str( ))&suffix;
        %end;
    %end;
    &output
%mend;