我有一个看起来像这样的数据集:
IDnum State Product Consumption
123 MI A 30
123 MI B 20
123 MI C 45
456 NJ A 15
456 NJ D 10
789 MI B 60
... ... ... ...
我想创建一个新的数据集,其中每个IDnum都有一行,每个不同的产品都有一个新的虚拟变量(在我的真实数据集中我有近1000个产品),以及它的相关消费。它看起来像是这些行中的东西
IDnum State Prod.A Cons.A Prod.B Cons.B Prod.C Cons.C Prod.D Cons.D
123 MI yes 30 yes 20 yes 45 no -
456 NJ yes 15 no - no - yes 10
789 MI no - yes 60 no - no -
... ... ... ... ... ... ... ... ... ...
某些变量(如“状态”)不会在同一个IDnum内发生变化,但原始银行中的每一行都相当于一次购买,因此相同IDnum的“产品”和“消费”变量发生了变化。我想我的新数据集在一行中显示了每个客户的所有消费习惯,但到目前为止我都失败了。
任何帮助都会有很大帮助。
答案 0 :(得分:1)
没有是/否变量,这很容易:
data input;
length State $2 Product $1;
input IDnum State Product Consumption;
cards;
123 MI A 30
123 MI B 20
123 MI C 45
456 NJ A 15
456 NJ D 10
789 MI B 60
;
run;
proc transpose data=input out=output(drop=_NAME_) prefix=Cons_;
var Consumption;
id Product;
by IDnum State;
run;
添加是/否字段:
proc sql;/* from column names or alternatively
create it from source data directly if not taking too long */
create table work.products as
select scan(name, 2, '_') as product length=1
from dictionary.columns
where libname='WORK' and memname='OUTPUT'
and upcase(name) like 'CONS_%';
quit;
filename vars temp;/* write a temp file containing variable definitions
in desired order */
data _null_;
set work.products end=last;
file vars;
length str $40;
if _N_ = 1 then put 'LENGTH ';
str = catt('Prod_', product, ' $3');
put str;
str = catt('Cons_', product, ' 8');
put str;
if last then put ';';
run;
options source2;
data output2;
length IdNum 8 State $2;
%include vars;
set output;
array prod{*} Prod_:;
array cons{*} Cons_:;
drop i;
do i=1 to dim(prod);
if coalesce(cons(i), 0) ne 0 then prod(i)='yes';
else prod(i)='no';
end;
run;