我有2个数据集A和B.我想在A中添加一个变量(列),它从数据集B中获取一个值。例如,我的数据集A是:
Table A
year return
1990 4.5
1991 6.2
1992 3
1993 9.9
Table B
year type value
1992 bond_threshold 10
1992 stock_threshold 15
我想要的新数据集是:
year return bond_threshold stock_threshold
1990 4.5
1991 6.2
1992 3 10 15
1993 9.9
我应该怎么做?我尝试过用merge合并,但结果中1992年出现了两条观测记录:
/* Match-merge A and B on year. B holds two rows for 1992 (one per type),
   so the merged result also contains two rows for 1992 instead of one
   row with two new columns. */
data want;
merge A B;
by year;
run;
结果如下:
year return type value
1990 4.5
1991 6.2
1992 3 bond_threshold 10
1992 3 stock_threshold 15
1993 9.9
答案 0 :(得分:2)
这里有两个选项,一个是使用PROC TRANSPOSE将数据集转换为宽格式,然后进行合并。第二种是与数据集B合并两次,每次保留感兴趣的变量。
根据您问题的规模,其中一种方法会比另一种更容易实现。
以下是第二个选项的示例。
/* Merge A with two filtered copies of B, one per threshold type.
   - The WHERE= literals must match the stored values exactly
     (bond_threshold / stock_threshold, with an underscore).
   - Each copy renames VALUE to its own column; otherwise both copies would
     write to the same VALUE variable and the second would overwrite the first.
   - TYPE is only needed for filtering, so it is dropped from the output.
   Both A and B must be sorted by year for the BY statement. */
data want;
    merge a
          b (where=(type='bond_threshold')  rename=(value=bond_threshold))
          b (where=(type='stock_threshold') rename=(value=stock_threshold));
    by Year;
    drop type;
run;
答案 1 :(得分:2)
我使用不同规模的数据进行了一些性能测试:年份数分别为100.000、1.000.000、10.000.000和100.000.000。我还使用了%transpose宏(http://www.sascommunity.org/mwiki/images/b/be/BB-07-2013.sas),因为它比proc transpose快。
每一步的CPU时间都进行了测量。
结论:对于大型数据集,建议不要使用变体1。在全部四次运行中,性能最稳定的是变体2和变体3。对于非常大的数据集(表a中超过100.000.000行),变体3的效果更好,因为merge会比proc sql更快。
下面是转置宏的代码:
/*
  %transpose - reshape a data set from long to wide with a single data step.

  Parameters (all keyword, all optional unless the step needs them):
    libname_in=    libref of the input data set; defaults to WORK when DATA=
                   is a one-level name.
    libname_out=   libref of the output data set; defaults to WORK when OUT=
                   is a one-level name.
    data=          input data set (one- or two-level name); may carry data set
                   options in parentheses, which are split off into &dsoptions.
    out=           output data set (one- or two-level name).
    by=            BY variables; the last one drives first./last. processing.
    prefix=        text placed at the start of every transposed variable name.
    var=           variables to transpose. When empty, they are taken from the
                   input data set excluding BY=, ID= and COPY= variables and
                   filtered by AUTOVARS=.
    autovars=      CHAR -> character variables only, ALL -> every variable,
                   anything else -> numeric variables only (used only when
                   VAR= is empty).
    id=            variable whose values label the transposed columns. When
                   empty, a per-BY-group counter named &newid. is created.
    descendingid=  YES -> order the ID values descending.
    var_first=     NO -> names are built as <id value><delimiter><var name>;
                   otherwise <var name><delimiter><id value>.
    format=        format applied to the ID values when building names; when
                   empty it is taken from the ID variable's format or derived
                   from its type and length.
    delimiter=     text placed between the var name and the ID value.
    copy=          extra variables carried from the input to the output.
    drop=          variables dropped in the transposing data step.
    sort=          YES -> sort the input by &by. into WORK.T_E_M_P first;
                   otherwise the BY statement uses NOTSORTED.
    sort_options=  options appended to the PROC SORT statement.
    use_varname=   NO -> leave the var name out of the transposed names.
    preloadfmt=    data set supplying the ID values and their ORDER instead of
                   discovering them with PROC FREQ.
    guessingrows=  number of observations scanned to discover the ID values;
                   defaults to constant(EXACTINT).
    newid=         name of the generated ID counter; defaults to ROW.

  Side effects: creates and finally deletes WORK.T_E_M_P and WORK._FOR_FORMAT,
  and defines a format named LABELFMT / $LABELFMT.

  Fix applied in this version: the macro variable references &copy. and
  &notsorted. had been corrupted into the characters (c) and (not sign) by an
  HTML-entity decoding step; they are restored here.
*/
%macro transpose(libname_in=,
libname_out=,
data=,
out=,
by=,
prefix=,
var=,
autovars=,
id=,
descendingid=,
var_first=,
format=,
delimiter=,
copy=,
drop=,
sort=,
sort_options=,
use_varname=,
preloadfmt=,
guessingrows=,
newid=);
/*Check whether the data and out parameters contain one or two-level filenames*/
  %let lp=%sysfunc(findc(%superq(data),%str(%()));
  %if &lp. %then %do;
    %let rp=%sysfunc(findc(%superq(data),%str(%)),b));
    %let dsoptions=%qsysfunc(substrn(%nrstr(%superq(data)),&lp+1,&rp-&lp-1));
    %let data=%sysfunc(substrn(%nrstr(%superq(data)),1,%eval(&lp-1)));
  %end;
  %else %let dsoptions=;
  %if %sysfunc(countw(&data.)) eq 2 %then %do;
    %let libname_in=%scan(&data.,1);
    %let data=%scan(&data.,2);
  %end;
  %else %if %length(&libname_in.) eq 0 %then %do;
    %let libname_in=work;
  %end;
  %if %sysfunc(countw(&out.)) eq 2 %then %do;
    %let libname_out=%scan(&out.,1);
    %let out=%scan(&out.,2);
  %end;
  %else %if %length(&libname_out.) eq 0 %then %do;
    %let libname_out=work;
  %end;
  %if %length(&newid.) eq 0 %then %do;
    %let newid=row;
  %end;
/*obtain last by variable*/
  %if %length(&by.) gt 0 %then %do;
    %let lastby=%scan(&by.,-1);
  %end;
  %else %do;
    %let lastby=;
  %end;
/*Create macro variable to contain a list of variables to be copied*/
  %let to_copy=;
  %if %length(&copy.) gt 0 %then %do;
    data t_e_m_p;
    set &libname_in..&data. (obs=1 keep=&copy.);
    run;
    proc sql noprint;
    select name
    into :to_copy separated by " "
    from dictionary.columns
    where libname="WORK" and
    memname="T_E_M_P"
    ;
    quit;
  %end;
/*Populate var parameter in the event it has a null value*/
  %if %length(&var.) eq 0 %then %do;
    data t_e_m_p;
    set &libname_in..&data. (obs=1 drop=&by. &id. &copy.);
    run;
    proc sql noprint;
    select name
    into :var separated by " "
    from dictionary.columns
    where libname="WORK" and
    memname="T_E_M_P"
    %if %sysfunc(upcase("&autovars.")) eq "CHAR" %then %do;
      and type="char"
    %end;
    %else %if %sysfunc(upcase("&autovars.")) ne "ALL" %then %do;
      and type="num"
    %end;
    ;
    quit;
  %end;
/*Initialize macro variables*/
  %let vars_char=;
  %let varlist_char=;
  %let vars_num=;
  %let varlist_num=;
  %let formats_char=;
  %let format_char=;
  %let formats_num=;
  %let format_num=;
/*Create file t_e_m_p to contain one record with all var variables*/
  data t_e_m_p;
  set &libname_in..&data. (obs=1 keep=&var.);
  run;
/*Create macro variables containing untransposed var names and formats*/
  proc sql noprint;
  select name, case
  when missing(format) then " $"||strip(put(length,5.))||'.'
  else strip(format)
  end
  into :vars_char separated by " ",
  :formats_char separated by "~"
  from dictionary.columns
  where libname="WORK" and
  memname="T_E_M_P" and
  type="char"
  ;
  select name, case
  when missing(format) then "best12."
  else strip(format)
  end
  into :vars_num separated by " ",
  :formats_num separated by "~"
  from dictionary.columns
  where libname="WORK" and
  memname="T_E_M_P" and
  type="num"
  ;
  select name
  into :vars_all separated by " "
  from dictionary.columns
  where libname="WORK" and
  memname="T_E_M_P"
  ;
  quit;
/*If sort parameter has a value of YES, create a sorted temporary data file*/
  %if %sysfunc(upcase("&sort.")) eq "YES" %then %do;
    %let notsorted=;
    proc sort data=&libname_in..&data.
    (
    keep=&by. &id. &vars_char. &vars_num. &to_copy.
    &dsoptions.
    )
    out=t_e_m_p &sort_options. noequals;
    by &by.;
    run;
    %let data=t_e_m_p;
    %let libname_in=work;
  %end;
  %else %do;
    %let notsorted=notsorted;
  %end;
/*if no id parameter is present, create one from &newid.*/
  %if %length(&id.) eq 0 %then %do;
    data t_e_m_p;
    set &libname_in..&data.;
    by &by.;
    if first.&lastby then &newid.=1;
    else &newid+1;
    run;
    %let id=&newid.;
    %let data=t_e_m_p;
    %let libname_in=work;
  %end;
/*Ensure guessingrows parameter contains a value*/
  %if %length(&guessingrows.) eq 0 %then %do;
    %let guessingrows=%sysfunc(constant(EXACTINT));
  %end;
/*Ensure a format is assigned to an id variable*/
  %if %length(&id.) gt 0 %then %do;
    proc sql noprint;
    select type,length,%sysfunc(strip(format))
    into :tr_macro_type, :tr_macro_len, :tr_macro_format
    from dictionary.columns
    where libname="%sysfunc(upcase(&libname_in.))" and
    memname="%sysfunc(upcase(&data.))" and
    upcase(name)="%sysfunc(upcase(&id.))"
    ;
    quit;
    %if %length(&format.) eq 0 %then %do;
      %let optsave=%sysfunc(getoption(missing),$quote.);
      options missing=.;
      %if %length(&tr_macro_format.) gt 0 %then %do;
        %let format=&tr_macro_format.;
      %end;
      %else %if "&tr_macro_type." eq "num " %then %do;
        %let format=%sysfunc(catt(best,&tr_macro_len.,%str(.)));
      %end;
      %else %do;
        %let format=%sysfunc(catt($,&tr_macro_len.,%str(.)));
      %end;
      options missing=&optsave;
    %end;
  %end;
/*Create macro variables containing ordered lists of the requested transposed variable
names for character (varlist_char) and numeric (varlist_num) var variables */
  %if %length(&preloadfmt.) gt 0 %then %do;
    %if %sysfunc(countw(&preloadfmt.)) eq 1 %then %do;
      %let preloadfmt=&libname_in..&preloadfmt.;
    %end;
  %end;
  %else %do;
    %if %sysfunc(upcase("&sort.")) eq "YES" %then
    %let dsoptions=;
    proc freq data=&libname_in..&data. (obs=&guessingrows. keep=&id. &dsoptions.)
    noprint;
    tables &id./out=_for_format (keep=&id.);
    run;
    %if %sysfunc(upcase("&descendingid.")) eq "YES" %then %do;
      proc sort data=_for_format;
      by descending &id;
      run;
    %end;
    data _for_format;
    set _for_format;
    order=_n_;
    run;
  %end;
  proc sql noprint;
  %do i=1 %to 2;
    %if &i. eq 1 %then %let i_type=char;
    %else %let i_type=num;
    %if %length(&&vars_&i_type.) gt 0 %then %do;
      select distinct
      %do j=1 %to 2;
        %if &j. eq 1 %then %let j_type=;
        %else %let j_type=format;
        %do k=1 %to %sysfunc(countw(&&vars_&i_type.));
          "&j_type. "||cats("&prefix.",
          %if %sysfunc(upcase("&var_first.")) eq "NO" %then %do;
            put(&id.,&format),"&delimiter."
            %if %sysfunc(upcase("&use_varname.")) ne "NO" %then
            ,scan("&&vars_&i_type.",&k.);
          %end;
          %else %do;
            %if %sysfunc(upcase("&use_varname.")) ne "NO" %then
            scan("&&vars_&i_type.",&k.),;
            "&delimiter.",put(&id.,&format)
          %end;
          )
          %if &j. eq 2 %then
          ||" "||cats(scan("&&formats_&i_type.",&k.,"~"),";");
          %if &k. lt %sysfunc(countw(&&vars_&i_type.)) %then ||;
          %else ,;
        %end;
      %end;
      %if "&tr_macro_type." eq "num " %then &id. format=best12.;
      %else &id.;
      ,order
      into :varlist_&i_type. separated by " ",
      :format_&i_type. separated by " ",
      :idlist separated by " ",
      :idorder separated by " "
      %if %length(&preloadfmt.) gt 0 %then from &preloadfmt.;
      %else from _for_format;
      order by order
      ;
      %let num_numlabels=&sqlobs.;
    %end;
  %end;
  quit;
  proc sql noprint;
  select distinct
  %let j_type=;
  %do k=1 %to %sysfunc(countw(&&vars_all.));
    "&j_type. "||cats("&prefix.",
    %if %sysfunc(upcase("&var_first.")) eq "NO" %then %do;
      put(&id.,&format),"&delimiter.",
      %if %sysfunc(upcase("&use_varname.")) ne "NO" %then
      scan("&&vars_all.",&k.);
      )
    %end;
    %else %do;
      %if %sysfunc(upcase("&use_varname.")) ne "NO" %then
      scan("&&vars_all.",&k.),;
      "&delimiter.",put(&id.,&format))
    %end;
    %if &k. lt %sysfunc(countw(&&vars_all.)) %then ||;
    %else ,;
  %end;
  order
  into :varlist_all separated by " ",
  :idorder separated by " "
  %if %length(&preloadfmt.) gt 0 %then from &preloadfmt.;
  %else from _for_format;
  order by order
  ;
  quit;
/*Create a format that will be used to assign values to the transposed variables*/
  data _for_format;
  %if %length(&preloadfmt.) gt 0 %then set &preloadfmt. (rename=(&id.=start));
  %else set _for_format (rename=(&id.=start));
  ;
  %if "&tr_macro_type." eq "num " %then retain fmtname "labelfmt" type "N";
  %else retain fmtname "$labelfmt" type "C";
  ;
  label=
  %if %length(&preloadfmt.) eq 0 %then _n_-1;
  %else order-1;
  ;
  run;
  proc format cntlin = _for_format;
  run ;
/*Create and run the datastep that does the transposition*/
  data &libname_out..&out.;
  set &libname_in..&data. (keep=&by. &id.
  %do i=1 %to %sysfunc(countw("&vars_char."));
    %scan(&vars_char.,&i.)
  %end;
  %do i=1 %to %sysfunc(countw("&vars_num."));
    %scan(&vars_num.,&i.)
  %end;
  %do i=1 %to %sysfunc(countw("&to_copy."));
    %scan(&to_copy.,&i.)
  %end;
  &dsoptions.
  );
  /* &notsorted. is NOTSORTED unless sort=YES produced a sorted copy above */
  by &by. &notsorted.;
  &format_char. &format_num.
  %if %length(&vars_char.) gt 0 %then %do;
    array want_char(*) $
    %do i=1 %to %eval(&num_numlabels.*%sysfunc(countw("&vars_char.")));
      %scan(&varlist_char.,&i.)
    %end;
    ;
    array have_char(*) $ &vars_char.;
    retain want_char;
    if first.&lastby. then call missing(of want_char(*));
    /* labelfmt maps the id value to a zero-based slot index */
    ___nchar=put(&id.,labelfmt.)*dim(have_char);
    do ___i=1 to dim(have_char);
    want_char(___nchar+___i)=have_char(___i);
    end;
  %end;
  %if %length(&vars_num.) gt 0 %then %do;
    array want_num(*)
    %do i=1 %to %eval(&num_numlabels.*%sysfunc(countw("&vars_num.")));
      %scan(&varlist_num.,&i.)
    %end;
    ;
    array have_num(*) &vars_num.;
    retain want_num;
    if first.&lastby. then call missing(of want_num(*));
    ___nnum=put(&id.,labelfmt.)*dim(have_num);
    do ___i=1 to dim(have_num);
    want_num(___nnum+___i)=have_num(___i);
    end;
  %end;
  drop &id. ___: &var. &drop.;
  if last.&lastby. then output;
  run;
  /* Re-read the result so the columns appear in BY, COPY, transposed order */
  data &libname_out..&out.;
  retain &by. &to_copy. &varlist_all.;
  set &libname_out..&out.;
  run;
/*Delete all temporary files*/
  proc delete data=work.t_e_m_p work._for_format;
  run;
%mend transpose;
以下是性能测试的代码:
/*********************************************
PERFORMANCE TEST: PREPARING DATA
*********************************************/
/* Table a: one observation per year, each with a constant return of 4.5. */
data a;
    do year = 1 to 100000000;
        return = 4.5;
        output;
    end;
run;

/* Table b: two observations per year, one for each threshold type,
   both carrying value 10. TYPE is declared first so it is long enough
   for either label. */
data b;
    length type $20;
    do year = 1 to 100000000;
        do type = "bond_threshold", "stock_threshold";
            value = 10;
            output;
        end;
    end;
run;
%put ++++++++++ Variant 1 +++++++++++++++++++++++++++++++++++++++++++++++;
/* Variant 1: transpose b to one row per year, then LEFT JOIN onto a.
   use_varname=no keeps the VAR name ("value") out of the generated column
   names, so the transposed columns are called bond_threshold and
   stock_threshold - the names the SELECT list below refers to. Without it
   the macro builds valuebond_threshold / valuestock_threshold. */
%transpose(data=b, out=b2,
by=year, var=value,
id=type, use_varname=no
);
proc sql noprint;
CREATE TABLE wanted AS
SELECT a.year
,a.return
,b2.bond_threshold
,b2.stock_threshold
FROM a
LEFT JOIN b2
ON a.year=b2.year
;
quit;
%put +++++++++++ Variant 2 ++++++++++++++++++++++++++++++++++++++++++++++;
/* Variant 2: match-merge a with the already transposed table b2.
   Both inputs are sorted by year first, as the BY statement requires. */
proc sort data=a;
    by year;
run;

proc sort data=b2;
    by year;
run;

data want2;
    merge a b2;
    by year;
run;
%put ++++++++++ Variant 3 +++++++++++++++++++++++++++++++++++++++++++++++;
/* Variant 3: match-merge a with two filtered copies of the long table b.
   Each copy renames VALUE to its own column; without the rename both copies
   write to the same VALUE variable and the second overwrites the first, so
   the result would not contain bond_threshold / stock_threshold columns.
   TYPE is only used for filtering and is dropped from the output. */
proc sort data=b;
by year;
run;
data want;
merge a
b (where = (type='bond_threshold') rename=(value=bond_threshold))
b (where = (type='stock_threshold') rename=(value=stock_threshold));
by Year;
drop type;
run;