Base SAS:重组交叉制表数据集

时间:2014-09-29 19:37:09

标签: sas crosstab transpose

我需要重组数据集以生成交叉表输出/数据集。我想回答的问题是:客户在首次注册购买之后又购买了哪些其他产品,以及这些产品的组合是什么——例如,客户在注册时购买了prod3,之后是否又购买了prod2和prod4。

我的初始数据集如下所示。请注意,某些产品的购买日期(Purchase_Date)与注册日期(Signup_dt)相同,另一些产品则没有购买日期:

ID  Signup_dt   Prod_type   Purchase_Date
2232    4-Jun-14    prod1   4-Jun-14
2232    4-Jun-14    prod2   4-Jun-14
2232    4-Jun-14    prod3   4-Jun-14
2232    4-Jun-14    prod4
2232    4-Jun-14    prod5   4-Aug-14
4545    12-Jun-14   prod1
4545    12-Jun-14   prod2   13-Sep-14
4545    12-Jun-14   prod3   12-Jun-14
4545    12-Jun-14   prod4   12-Nov-14
4545    12-Jun-14   prod5   12-Jun-14

我需要重组数据集,所以看起来像这样:

ID  startup_month   Products Purchase_at_Start_Up    prod1   prod2   prod3   prod4   prod5
2232    June         prod1            1                                               1
2232    June         prod2            1                                               1
2232    June         prod3            1                                               1
2232    June         prod4                      
2232    June         prod5                      
4545    June         prod1                      
4545    June         prod2                      
4545    June         prod3            1                      1                        1
4545    June         prod4                      
4545    June         prod5            1                      1                        1

然后是一个汇总数据集,它给出了产品组合的总数/计数

Startup_month   Products Purchase_Start_up_count prod1_dt    prod2_dt    prod3_dt    prod4_dt prod5_dt
June              prod1           1                                                             1
June              prod2           1                                                             1
June              prod3           2                1                                            2
June              prod4                     
June              prod5           1                1                                            1

我想不出该如何编写这个程序。我一直在尝试proc transpose和proc freq的不同组合,但始终无法得到接近所需的输出。非常感谢任何帮助。

3 个答案:

答案 0 :(得分:0)

从您的问题中看不出第一个汇总表的确切规则(例如,为什么prod5列只在前三行为1,而不是在前五行都为1)。尽管如此,您可以结合使用proc transpose和数据步来构建类似的汇总:

/* Build the sample input data set HAVE from comma-delimited in-stream rows.  */
/*   DSD       : a trailing or doubled comma yields a missing value.          */
/*   TRUNCOVER : a short record never makes INPUT flow onto the next record   */
/*               (rows without a Purchase_Date end right after the comma).    */
/* The ":" modifier turns the date fields into modified list input, so the    */
/* informat stops at the delimiter instead of reading a fixed 9 columns;      */
/* without it "4-Jun-14," (8 characters plus the comma) is read as one        */
/* 9-column field and rejected as an invalid date.                            */
data have;
    infile datalines dlm = "," dsd truncover;
    input 
        ID
        Signup_dt     : date9.
        Prod_type     $
        Purchase_Date : date9.;
    /* Display both SAS dates as yyyy-mm-dd. */
    format Signup_dt Purchase_Date yymmdd10.;
datalines;
2232,4-Jun-14,prod1,4-Jun-14
2232,4-Jun-14,prod2,4-Jun-14
2232,4-Jun-14,prod3,4-Jun-14
2232,4-Jun-14,prod4,
2232,4-Jun-14,prod5,4-Aug-14
4545,12-Jun-14,prod1,
4545,12-Jun-14,prod2,13-Sep-14
4545,12-Jun-14,prod3,12-Jun-14
4545,12-Jun-14,prod4,12-Nov-14
4545,12-Jun-14,prod5,12-Jun-14
;
run;
/* Derive the signup month name and mark purchases made on the signup date. */
data want1 (drop = Purchase_Date Signup_dt);
    set have;

    /* Month name of the signup date; STRIP removes any padding blanks. */
    startup_month = strip(put(Signup_dt, monname9.));

    /* 1 when the purchase date equals the signup date, otherwise left missing. */
    if Purchase_Date = Signup_dt then do;
        Purchase_at_Start_Up = 1;
    end;
run;
/* Create dummy flag for transpose: flag = 1 for every row that has a        */
/* purchase date, missing otherwise.                                          */
data have1;
    set have;
    /* Test for a non-missing value explicitly. A bare numeric condition is  */
    /* false for 0 as well as for missing, and 0 is a valid SAS date value   */
    /* (01JAN1960).                                                           */
    if not missing(Purchase_Date) then flag = 1;
run;
/* Pivot the purchase flags: one output row per ID, one column per product. */
proc transpose
    data = have1
    out  = want2 (drop = _NAME_)
    ;
    var flag;          /* value written into the new columns               */
    id Prod_type;      /* values of Prod_type become the new column names  */
    by ID;             /* input must already be grouped/sorted by ID       */
run;
/* Match-merge by ID: the product columns of WANT2 are attached to the */
/* rows of WANT1 that share the same ID.                               */
data want;
    merge
        want1
        want2
        ;
    by ID;
run;

答案 1 :(得分:0)

有趣的问题。

您的汇总表不正确:ID 4545在注册时购买了prod5,但之后并没有再额外购买该产品。不过,您想要实现的目标还是相当清楚的。上面的解决方案也不正确:它把额外购买也关联到了注册时并未购买的产品上。

您的第一个汇总表,按id、startup_month和product汇总:

proc sql;
    /*  One row per id / signup month / product.                            */
    /*  su = every row of HAVE, ap = per-id counts of additional purchases. */
    create table want_sql1 as
    select
        su.id,
        strip(put(su.Signup_dt, monname9.))  as startup_month,
        su.prod_type                         as Products,

        /*  Counts rows whose purchase date equals the signup date. */
        sum(case when su.Signup_dt eq su.Purchase_Date then 1 end)
                                             as Purchase_at_Start_Up,

        sum(ap.prod1)                        as prod1,
        sum(ap.prod2)                        as prod2,
        sum(ap.prod3)                        as prod3,
        sum(ap.prod4)                        as prod4,
        sum(ap.prod5)                        as prod5

    from have as su

    left join
        (
            /*  Per id: how often each product was bought on a date */
            /*  other than the signup date.                         */
            select
                id,
                sum(case when prod_type eq 'prod1' then 1 end) as prod1,
                sum(case when prod_type eq 'prod2' then 1 end) as prod2,
                sum(case when prod_type eq 'prod3' then 1 end) as prod3,
                sum(case when prod_type eq 'prod4' then 1 end) as prod4,
                sum(case when prod_type eq 'prod5' then 1 end) as prod5
            from have
            where Signup_dt ne Purchase_Date
              and Purchase_Date is not null
            group by id
        ) as ap

        /*  Match on id, and only for rows bought on the signup date; */
        /*  all other rows keep missing values in prod1-prod5.        */
        on  su.id        eq ap.id
        and su.Signup_dt eq su.Purchase_Date

    group by 1, 2, 3
    ;
quit;


/* Compare the SQL result (base) with data set WANT (comparison). */
proc compare base = want_sql1 compare = want;
run;

您的第二个汇总表,按startup_month和product汇总:
    proc sql;         创建表want_sql2为

    select strip(put(sign_up_purchases.Signup_dt, monname9.))   as startup_month

        ,sign_up_purchases.prod_type                as Products

        ,sum (case when sign_up_purchases.Signup_dt eq sign_up_purchases.Purchase_Date then 1 end) 
                                        as Purchase_at_Start_Up

        ,sum (additional_purchases.prod1_dt)            as prod1_dt    
        ,sum (additional_purchases.prod2_dt)            as prod2_dt    
        ,sum (additional_purchases.prod3_dt)            as prod3_dt    
        ,sum (additional_purchases.prod4_dt)            as prod4_dt    
        ,sum (additional_purchases.prod5_dt)            as prod5_dt    

    from    have    as  sign_up_purchases

/*      get a summary of additional purchases. */
    left    join
        (   select  id
                ,sum (case when prod_type eq 'prod1' then 1 end)
                                        as prod1_dt    

                ,sum (case when prod_type eq 'prod2' then 1 end)
                                        as prod2_dt    

                ,sum (case when prod_type eq 'prod3' then 1 end)
                                        as prod3_dt    

                ,sum (case when prod_type eq 'prod4' then 1 end)
                                        as prod4_dt    

                ,sum (case when prod_type eq 'prod5' then 1 end)
                                        as prod5_dt    

            from    have    

            where   Signup_dt   ne  Purchase_Date 
            and Purchase_Date is    not null

            group   by  id
        )                   as  additional_purchases
/*      Join by id and products with sign_up purchases. */
    on  sign_up_purchases.id        eq  additional_purchases.id
    and sign_up_purchases.Signup_dt eq  sign_up_purchases.Purchase_Date         

    group   by  1,2
    ;
quit;   

答案 2 :(得分:0)

这是一个涉及数据步骤,Proc Transpose和Proc Summary的版本,产生2个输出(参见want1和want2)。

与大多数情况一样,这个版本的代码量比SQL版本多得多。SQL更直观、更强大,也更灵活。

/*  Flag additional purchases: a purchase date exists and it differs */
/*  from the signup date.                                            */
data additional_Purchases1;
    set have;

    /*  No row is filtered out here: unflagged rows are kept so that   */
    /*  the later PROC TRANSPOSE creates a column for every product    */
    /*  (prod1-prod5), not only for products bought additionally.      */
    if Purchase_Date ne . and Purchase_Date ne Signup_dt then do;
        additional_Purchases = 1;
    end;
run;



/*  Total the additional-purchase flags for each id / product combination. */
/*  NWAY keeps only the rows for the full id * Prod_type crossing.         */
proc summary data = additional_Purchases1 nway;
    class id Prod_type;
    var additional_Purchases;
    output
        out = additional_Purchases2
        sum = additional_Purchases;
run;



/* Pivot the totals: one row per ID, one column per product. */
proc transpose data = additional_Purchases2
               out  = additional_Purchases (drop = _NAME_);
    var additional_Purchases;   /* value written into the new columns       */
    id Prod_type;               /* values of Prod_type become column names  */
    by ID;                      /* one output row per ID                    */
run;



/*  Join sign-up purchases to the per-ID additional purchases.              */
/*  Output: one row per row of HAVE; prod1-prod5 are filled in only on rows */
/*  whose product was bought on the signup date.                            */
data want1 
    (keep = id startup_month Products Purchase_at_Start_Up prod1-prod5) 
    ;

    /*  Placed ahead of MERGE, so these variables are the first ones */
    /*  added to the program data vector.                            */
    format id           10.
        startup_month
        Products        $10.
        Purchase_at_Start_Up 10.
        ;

    /*  The transposed prod1-prod5 are renamed to oprod1-oprod5 so that   */
    /*  prod1-prod5 can be rebuilt below as new variables.                */
    merge   have    (   in=sign_up_purchases 
                rename = (prod_type=Products)
            )
        additional_Purchases    
            (   in=additional_purchases 
                rename =(prod1-prod5 = oprod1-oprod5)
            )
        ;
    by id;

    /*  Keep only observations that have a row in HAVE. */
    if  sign_up_purchases;

    startup_month = strip (put(Signup_dt, monname9.));

    /*  Variables read by MERGE keep their values across the observations  */
    /*  of a BY group, so overwriting the merged columns in place would    */
    /*  carry the change into the following rows. The values are therefore */
    /*  copied from oprod1-oprod5 into fresh prod1-prod5 variables, which  */
    /*  are reset to missing on every iteration.                           */
    /*  Explicit arrays with an indexed loop replace the legacy implicit   */
    /*  arrays and DO OVER.                                                */
    array oprod {5} oprod1-oprod5;
    array prod  {5} prod1-prod5;

    /*  Copy additional purchases only for products bought at signup.      */
    if  Signup_dt = Purchase_Date then do;
        Purchase_at_Start_Up = 1;
        /*  _k is a work variable; KEEP= above leaves it out of WANT1. */
        do _k = 1 to dim(prod);
            prod{_k} = oprod{_k};
        end;
    end;
run;


/*  Summarise sign-up and additional purchases by startup month and product. */
/*  NWAY keeps only the rows for the full startup_month * products crossing; */
/*  the automatic _TYPE_ and _FREQ_ columns are dropped from the output.     */
proc summary 
    nway
    data=want1
    ;
    class startup_month products;
    var Purchase_at_Start_Up prod1-prod5;
    /*  The SUM= names are assigned to the VAR variables in order. */
    output  out = want2 (drop=_type_ _freq_)
        sum = Purchase_at_Start_up_count prod1_dt prod2_dt prod3_dt prod4_dt prod5_dt
        ;
/*  PROC SUMMARY is not a run-group procedure: end the step with RUN, not QUIT. */
run;