我需要重组数据集以生成交叉表输出/数据集。我想回答的问题是:客户在注册时首次购买之后,又购买了哪些其他产品,以及产品组合是什么。例如,如果客户在注册时购买了prod3,他们之后是否又购买了prod2和prod4。
我的初始数据集如下所示。请注意,某些产品的购买日期(Purchase_Date)与注册日期(Signup_dt)相同:
ID Signup_dt Prod_type Purchase_Date
2232 4-Jun-14 prod1 4-Jun-14
2232 4-Jun-14 prod2 4-Jun-14
2232 4-Jun-14 prod3 4-Jun-14
2232 4-Jun-14 prod4
2232 4-Jun-14 prod5 4-Aug-14
4545 12-Jun-14 prod1
4545 12-Jun-14 prod2 13-Sep-14
4545 12-Jun-14 prod3 12-Jun-14
4545 12-Jun-14 prod4 12-Nov-14
4545 12-Jun-14 prod5 12-Jun-14
我需要重组数据集,所以看起来像这样:
ID startup_month Products Purchase_at_Start_Up prod1 prod2 prod3 prod4 prod5
2232 June prod1 1 1
2232 June prod2 1 1
2232 June prod3 1 1
2232 June prod4
2232 June prod5
4545 June prod1
4545 June prod2
4545 June prod3 1 1 1
4545 June prod4
4545 June prod5 1 1 1
然后是一个汇总数据集,它给出了产品组合的总数/计数
Startup_month Products Purchase_Start_up_count prod1_dt prod2_dt prod3_dt prod4_dt prod5_dt
June prod1 1 1
June prod2 1 1
June prod3 2 1 2
June prod4
June prod5 1 1 1
我想不出该如何编程实现这一点。我尝试过proc transpose和proc freq的各种组合,但始终无法得到接近所需的输出。非常感谢任何帮助。
答案 0 :(得分:0)
从您的问题中看不出第一个汇总表是按什么规则得出的(例如,为什么prod5列只在前三行中为1,而第五行却不是)。尽管如此,您可以结合使用proc transpose和数据步骤来构建类似的摘要:
/* Build the sample input: one row per customer ID and product.          */
/* Purchase_Date is left blank when no purchase date is recorded.        */
data have;
    /* dsd       : a trailing or doubled comma is read as a missing value. */
    /* truncover : a short record leaves the remaining variables missing   */
    /*             instead of continuing the read on the next data line.   */
    infile datalines dlm = "," dsd truncover;
    /* The colon modifier makes each informat stop at the delimiter rather */
    /* than consuming a fixed number of columns, so both 8-character       */
    /* (4-Jun-14) and 9-character (12-Jun-14) dates are read correctly.    */
    input
        ID
        Signup_dt     : date9.
        Prod_type     : $8.
        Purchase_Date : date9.;
    format Signup_dt Purchase_Date yymmdd10.;
    datalines;
2232,4-Jun-14,prod1,4-Jun-14
2232,4-Jun-14,prod2,4-Jun-14
2232,4-Jun-14,prod3,4-Jun-14
2232,4-Jun-14,prod4,
2232,4-Jun-14,prod5,4-Aug-14
4545,12-Jun-14,prod1,
4545,12-Jun-14,prod2,13-Sep-14
4545,12-Jun-14,prod3,12-Jun-14
4545,12-Jun-14,prod4,12-Nov-14
4545,12-Jun-14,prod5,12-Jun-14
;
run;
/* Add the signup month name and flag rows whose purchase date equals */
/* the signup date; the two date columns are not written to want1.    */
data want1 (drop = Purchase_Date Signup_dt);
    set have;

    /* Month name of the signup date, e.g. "June". */
    startup_month = strip(put(Signup_dt, monname9.));

    /* The flag stays missing unless the two dates are equal. */
    if Purchase_Date = Signup_dt then do;
        Purchase_at_Start_Up = 1;
    end;
run;
/* Add flag = 1 to every row that has a usable purchase date, so the */
/* later transpose has a value to spread across product columns.     */
data have1;
    set have;

    /* Same truth test as a bare numeric condition: the date must be */
    /* neither missing nor zero.                                     */
    if not missing(Purchase_Date) and Purchase_Date ne 0 then do;
        flag = 1;
    end;
run;
/* Pivot flag so that each Prod_type value becomes its own column, */
/* with one output row per ID.                                      */
proc transpose
        data = have1
        out  = want2 (drop = _NAME_);
    var flag;
    id  Prod_type;
    by  ID;
run;
/* Attach the per-ID product columns (want2) to the per-product rows */
/* (want1), matching on ID.                                          */
data want;
    merge
        want1
        want2
    ;
    by ID;
run;
答案 1 :(得分:0)
有趣的问题。
您的摘要表格不正确:id 4545 在注册时购买了prod5,但之后并没有额外购买该产品。不过,您想要实现的目标还是相当清楚的。上面的解决方案也是错误的:它把额外购买也合并到了那些在注册时并未购买的产品行上。
您的第一个汇总表,按id、startup_month和product汇总:
proc sql;
    /* One row per id, signup month and product.  Rows purchased on the   */
    /* signup date also receive that id's counts of purchases made on any */
    /* other date, one column per product.                                */
    create table want_sql1 as
    select
        su.id,
        strip(put(su.Signup_dt, monname9.)) as startup_month,
        su.prod_type                        as Products,
        sum(case when su.Purchase_Date = su.Signup_dt then 1 end)
                                            as Purchase_at_Start_Up,
        sum(extra.prod1)                    as prod1,
        sum(extra.prod2)                    as prod2,
        sum(extra.prod3)                    as prod3,
        sum(extra.prod4)                    as prod4,
        sum(extra.prod5)                    as prod5
    from have as su
    left join
        (
            /* Per-id counts of purchases dated differently from signup. */
            select
                id,
                sum(case when prod_type = 'prod1' then 1 end) as prod1,
                sum(case when prod_type = 'prod2' then 1 end) as prod2,
                sum(case when prod_type = 'prod3' then 1 end) as prod3,
                sum(case when prod_type = 'prod4' then 1 end) as prod4,
                sum(case when prod_type = 'prod5' then 1 end) as prod5
            from have
            where Purchase_Date is not null
              and Purchase_Date ne Signup_dt
            group by id
        ) as extra
        /* Only rows bought on the signup date match the extra counts. */
        on  su.id = extra.id
        and su.Purchase_Date = su.Signup_dt
    group by 1, 2, 3
    ;
quit;
/* Report differences between the SQL result (base) and data set want. */
proc compare base = want_sql1 compare = want;
run;
您的第二个汇总表,按startup_month和product汇总:
proc sql;
    /* One row per signup month and product.  Counts the purchases made  */
    /* on the signup date and, for those rows, sums each id's purchases  */
    /* made on any other date, one *_dt column per product.              */
    create table want_sql2 as
    select strip(put(sign_up_purchases.Signup_dt, monname9.)) as startup_month
          ,sign_up_purchases.prod_type as Products
          ,sum (case when sign_up_purchases.Signup_dt eq sign_up_purchases.Purchase_Date then 1 end)
               as Purchase_at_Start_Up
          ,sum (additional_purchases.prod1_dt) as prod1_dt
          ,sum (additional_purchases.prod2_dt) as prod2_dt
          ,sum (additional_purchases.prod3_dt) as prod3_dt
          ,sum (additional_purchases.prod4_dt) as prod4_dt
          ,sum (additional_purchases.prod5_dt) as prod5_dt
    from have as sign_up_purchases
    /* Per-id counts of purchases dated differently from signup. */
    left join
        ( select id
                ,sum (case when prod_type eq 'prod1' then 1 end)
                     as prod1_dt
                ,sum (case when prod_type eq 'prod2' then 1 end)
                     as prod2_dt
                ,sum (case when prod_type eq 'prod3' then 1 end)
                     as prod3_dt
                ,sum (case when prod_type eq 'prod4' then 1 end)
                     as prod4_dt
                ,sum (case when prod_type eq 'prod5' then 1 end)
                     as prod5_dt
          from have
          where Signup_dt ne Purchase_Date
            and Purchase_Date is not null
          group by id
        ) as additional_purchases
    /* Only rows bought on the signup date match the additional counts. */
    on  sign_up_purchases.id eq additional_purchases.id
    and sign_up_purchases.Signup_dt eq sign_up_purchases.Purchase_Date
    group by 1,2
    ;
quit;
答案 2 :(得分:0)
这是一个使用数据步骤、Proc Transpose和Proc Summary的版本,它生成2个输出(参见want1和want2)。
与大多数应用场景一样,这种写法的代码量比SQL版本多得多。SQL更直观、更强大,也更灵活。
/* Mark additional purchases: rows whose purchase date is present and  */
/* differs from the signup date.                                        */
data additional_Purchases1;
    set have;

    /* Every input row is kept, flagged or not, so that the later       */
    /* transpose still produces a column for each product (prod1-prod5) */
    /* and not only for products that had an additional purchase.       */
    if Purchase_Date ne . then do;
        if Purchase_Date ne Signup_dt then additional_Purchases = 1;
    end;
run;
/* Total the additional-purchase flag for each id / product pair. */
proc summary data = additional_Purchases1 nway;
    class id Prod_type;
    var   additional_Purchases;
    output
        out = additional_Purchases2
        sum = additional_Purchases;
run;
/* Pivot the per-product totals into one row per ID, with one column */
/* named after each Prod_type value.                                 */
proc transpose
        data = additional_Purchases2
        out  = additional_Purchases (drop = _NAME_);
    var additional_Purchases;
    id  Prod_type;
    by  ID;
run;
/* Join each id's product rows to that id's additional-purchase counts. */
/* Only rows bought on the signup date get the flag and the counts.     */
data want1 (keep = id startup_month Products Purchase_at_Start_Up prod1-prod5);
    /* Declared first so these columns lead the output in this order. */
    format id                   10.
           startup_month
           Products             $10.
           Purchase_at_Start_Up 10.
    ;

    merge have                 (in = from_have
                                rename = (prod_type = Products))
          additional_Purchases (in = from_extra
                                rename = (prod1-prod5 = oprod1-oprod5))
    ;
    by id;

    /* Keep only rows that come from have. */
    if from_have;

    startup_month = strip(put(Signup_dt, monname9.));

    /* The counts are read under the names oprod1-oprod5 and copied into */
    /* prod1-prod5 only when needed.  prod1-prod5 are created in this    */
    /* step, so they are not carried over from one row of an id to the   */
    /* next the way values read by MERGE are.                            */
    array src_cnt {5} oprod1-oprod5;
    array out_cnt {5} prod1-prod5;

    /* Rows purchased on the signup date get the flag and the counts. */
    if Signup_dt = Purchase_Date then do;
        Purchase_at_Start_Up = 1;
        do _k = 1 to dim(out_cnt);
            out_cnt{_k} = src_cnt{_k};
        end;
    end;
run;
/* Sum the signup flag and the additional-purchase counts for each     */
/* startup month / product pair.  NWAY keeps only the rows for the     */
/* full class combination; the automatic _type_ and _freq_ columns     */
/* are dropped from the output.                                         */
proc summary
        nway
        data = want1;
    class startup_month products;
    var Purchase_at_Start_Up prod1-prod5;
    output out = want2 (drop = _type_ _freq_)
           sum = Purchase_at_Start_up_count prod1_dt prod2_dt prod3_dt prod4_dt prod5_dt
    ;
/* PROC SUMMARY is not an interactive procedure, so it ends with RUN */
/* like the other non-interactive steps in this file.                */
run;