如何使用两个变量进行分组

时间:2014-09-24 08:57:09

标签: sql sas

我们有以下数据集:

DATE  VAR1  VAR2
1     A     1
2     A     1
3     B     1
4     C     2
5     D     3
6     E     4
7     F     5
8     B     6
9     B     7
10    D     1

每条记录都属于一个人,问题是一个人可以拥有多条不同值的记录。

识别一个人的规则:如果两条记录具有相同的VAR1,则它们属于同一个人;同样,如果两条记录具有相同的VAR2,它们也属于同一个人。

我的目标是创建一个新的变量IDPERSON,它唯一地标识每条记录的人。在我的例子中,只有4个不同的人:

DATE    VAR1    VAR2    IDPERSON
1       A       1       1
2       A       1       1
3       B       1       1
4       C       2       2
5       D       3       1
6       E       4       3
7       F       5       4
8       B       6       1
9       B       7       1
10      D       1       1

如何使用SQL或SAS实现此目的?

5 个答案:

答案 0 :(得分:1)

%macro grouper(
    inData /*Input dataset*/, 
    outData /*Output dataset: one row per &id1. value with its group ID*/, 
    id1 /*First identification variable (must be numeric)*/, 
    id2 /*Second identification variable*/, 
    idOut /*Name of variable to contain group ID*/, 
    maxN = 5 /*Max number of iterations in case of failure*/);
    /* Assign an ID to each distinct connected graph in a network.
       Two rows are linked when they share a value of &id1. or of &id2..
       Every row starts with &idOut. = &id1.; each pass replaces the ID of a
       row by the lowest ID seen among rows sharing either key, and the loop
       stops once a pass changes nothing or after &maxN. passes.
       Work tables named _g_* are created in the default library and are
       deleted at the end. */

    /* Keep working variables local so that a caller's own macro variables
       with the same names (for example a loop counter i) are not overwritten */
    %local i stopFlag;
    /* Defined up front so the checks below are valid even if the loop body
       never executes (maxN < 1) */
    %let stopFlag = 0;

    /* Create first guess for group ID */
    data _g_temp;
        set &inData.;
        &idOut. = &id1.;
    run;

    /* Loop, improve group ID each time */
    %let i = 1;
    %do %while (&i. <= &maxN.);
        %put Loop number &i.;
        %let i = %eval(&i. + 1);

        proc sql noprint;
            /* Find the lowest group ID for each group of first variable */
            create table _g_map1 as
            select 
                min(&idOut.) as &idOut.,
                &id1.
            from _g_temp
            group by &id1.;

            /* Find the lowest group ID for each group of second variable */
            create table _g_map2 as
            select 
                min(&idOut.) as &idOut.,
                &id2.
            from _g_temp
            group by &id2.;

            /* Find the lowest group ID from both grouping variables.
               min() with two arguments is the row-wise minimum here,
               not a summary function. */
            create table _g_new as
            select 
                a.&id1.,
                a.&id2.,
                coalesce(min(b.&idOut., c.&idOut.), a.&idOut.) as &idOut.,
                a.&idOut. as &idOut._old
            from _g_temp as a
            full outer join _g_map1 as b
                on a.&id1. = b.&id1.
            full outer join _g_map2 as c
                on a.&id2. = c.&id2.;

            /* Put results into temporary dataset ready for next iteration */
            create table _g_temp as
            select * 
            from _g_new;

            /* Check if the iteration provided any improvement.
               coalesce(..., 1): an empty table has nothing left to improve,
               and a missing flag would make the %if below fail. */
            select 
                coalesce(
                    min(
                        case when &idOut._old = &idOut. then 1 
                        else 0 
                    end), 1) into :stopFlag
            from _g_temp;
        quit;

        /* End loop if ID unchanged over last iteration */
        %if &stopFlag. %then %let i = %eval(&maxN. + 1); 
    %end;

    /* Tell the user when the iteration limit was hit before convergence */
    %if not &stopFlag. %then %put WARNING: grouper stopped after &maxN. iterations without converging - &idOut. may not be fully grouped.;

    /* Output lookup table */
    proc sql;
        create table &outData. as
        select 
            &id1., 
            min(&idOut.) as &idOut. 
        from _g_temp
        group by &id1.;
    quit;

    /* Clean up */
    proc datasets nolist;
        delete _g_:;
    quit;
%mend grouper;


/* Sample input: VAR1 is the numeric key, VAR2 the character key */
data baseData;
    input VAR1 VAR2 $;
    datalines;
1   A
1   A
1   B
2   C
3   D
4   E
5   F
6   B
7   B
1   D
1   X
7   G
6   Y
6   D
6   I
8   D
9   Z
9   X
;
run;

/* Build the VAR1 -> groupID lookup table in outData */
%grouper(baseData, outData, VAR1, VAR2, groupID);

答案 1 :(得分:0)

你觉得这会起作用吗?

它是用SAS编写的,但它使用的是SQL语句。

/* Sample data: VAR1 numeric key, VAR2 character key, DATE as row sequence */
data TEMP3;
    input VAR1 VAR2 $ DATE;
    datalines;
1   A   1
1   A   2
1   B   3
2   C   4
3   D   5
4   E   6
5   F   7
6   B   8
7   B   9
1   D   10
;
run;

proc sql;
    /* Distinct key pairs, listed VAR2-first */
    create table WORK.TEMP4 as
        select distinct VAR2, VAR1
        from WORK.TEMP3
        order by VAR2, VAR1;

    /* The same distinct key pairs, listed VAR1-first */
    create table WORK.TEMP5 as
        select distinct VAR1, VAR2
        from WORK.TEMP3
        order by VAR1, VAR2;

    /* For each pair, attach every VAR2 value that shares its VAR1 */
    create table WORK.TEMP6 as
        select p.VAR2, p.VAR1, q.VAR2 as VAR22
        from WORK.TEMP4 as p
        inner join WORK.TEMP5 as q
            on p.VAR1 = q.VAR1;

    /* ... and every VAR1 value that shares its VAR2 */
    create table WORK.TEMP7 as
        select r.*, q.VAR1 as VAR12
        from WORK.TEMP6 as r
        inner join WORK.TEMP5 as q
            on r.VAR2 = q.VAR2;

    /* Reduce to the distinct (VAR22, VAR12) combinations */
    create table WORK.TEMP8 as
        select distinct VAR22, VAR12
        from WORK.TEMP7
        order by VAR22, VAR12;

    /* Largest VAR12 per VAR22 */
    create table WORK.TEMP9 as
        select VAR22, max(VAR12) as VAR12
        from WORK.TEMP8
        group by VAR22;

    /* Keep only the TEMP8 rows that carry that largest VAR12 */
    create table WORK.TEMP10 as
        select s.*
        from WORK.TEMP8 as s
        inner join WORK.TEMP9 as t
            on s.VAR22 = t.VAR22
           and s.VAR12 = t.VAR12;

    /* Attach the chosen VAR12 to the original rows as IDPERSONA */
    create table WORK.TEMP11 as
        select src.*, m.VAR12 as IDPERSONA
        from WORK.TEMP3 as src
        left join WORK.TEMP10 as m
            on src.VAR2 = m.VAR22;
quit;

答案 2 :(得分:0)

我已将此问题分解为几个步骤,这些步骤适用于您提供的数据。可能有一种方法可以减少步骤数,但会牺牲可读性。如果这适用于您的真实数据,请告诉我。

/* Step 1: load the example records */
data have;
    input DATE VAR1 $ VAR2;
    datalines;
1     A     1
2     A     1
3     B     1
4     C     2
5     D     3
6     E     4
7     F     5
8     B     6
9     B     7
10    D     1
;
run;

/* Step 2: smallest VAR2 observed for each VAR1 value */
proc summary data=have nway idmin;
    class var1;
    output out=minvar2 (drop=_:)
           min(var2)=temp_var;
run;

/* Step 3: carry that minimum onto every record, ordered by it */
proc sql;
    create table temp1 as
        select rec.*,
               lo.temp_var
        from have as rec
        inner join minvar2 as lo
            on rec.var1 = lo.var1
        order by lo.temp_var;
quit;

/* Step 4: number the distinct minima 1, 2, 3, ... as idperson */
data want;
    set temp1;
    by temp_var;
    retain idperson 0;
    if first.temp_var then idperson = idperson + 1;
    drop temp_var;
run;

/* Step 5: restore the original record order */
proc sort data=want;
    by date var1;
run;

答案 3 :(得分:0)

基思:

您的解决方案无法正常运行,请查看以下数据集:

/* Larger test case: VAR2 is the numeric key, VAR1 the character key;
   DUMMY is a constant 1 on every row */
data TEMP3;
    input VAR2 VAR1 $ DATE;
    DUMMY = 1;
    datalines;
1   A   1
1   A   2
1   B   3
2   C   4
3   D   5
4   E   6
5   F   7
6   B   8
7   B   9
1   D   10
1   X   11
7   G   14
6   Y   15
6   D   16
6   I   18
8   D   20
9   Z   21
9   X   22
;
run;

您的程序的输出结果是:

VAR2    VAR1    DATE    DUMMY   idperson
1   A   1   1   1
1   A   2   1   1
1   B   3   1   1
2   C   4   1   2
3   D   5   1   1
4   E   6   1   3
5   F   7   1   4
6   B   8   1   1
7   B   9   1   1
1   D   10  1   1
1   X   11  1   1
7   G   14  1   6
6   Y   15  1   5
6   D   16  1   1
6   I   18  1   5
8   D   20  1   1
9   Z   21  1   7
9   X   22  1   1

由于VAR2 = 6的记录被分配了两个不同的ID(1和5),因此该结果不正确。

这就是我所做的,整个程序(这里没有发布)更复杂(并不那么优雅),因为它处理Var1和Var2中缺少的数据。

/* Derive a person ID (IDPERSONA) for each row of WORK.TEMP3 by expanding the
   VAR1/VAR2 links through a fixed sequence of joins and then taking the
   largest reachable VAR2 value as the ID.
   NOTE(review): the number of join hops is fixed (no loop), so presumably
   chains longer than these hops are not fully merged - confirm on real data. */
PROC SQL;
    /* Distinct (VAR1, VAR2) pairs among rows with DUMMY=1 and non-missing VAR2 */
    CREATE TABLE WORK.TEMP4 AS SELECT DISTINCT VAR1, VAR2 FROM WORK.TEMP3 WHERE DUMMY=1 AND VAR2^=. ORDER BY VAR1, VAR2;
    /* The same pairs with the columns listed and sorted VAR2-first */
    CREATE TABLE WORK.TEMP5 AS SELECT DISTINCT VAR2, VAR1 FROM WORK.TEMP3 WHERE DUMMY=1 AND VAR2^=. ORDER BY VAR2, VAR1;
    /* CIP2: every VAR1 value that shares the pair's VAR2 */
    CREATE TABLE WORK.TEMP6 AS SELECT TEMP4.*, TEMP5.VAR1 AS CIP2 FROM WORK.TEMP4 INNER JOIN WORK.TEMP5 ON (TEMP4.VAR2=TEMP5.VAR2);
    /* IDHH2: every VAR2 value that shares the pair's VAR1 */
    CREATE TABLE WORK.TEMP7 AS SELECT TEMP6.*, TEMP4.VAR2 AS IDHH2 FROM WORK.TEMP6 INNER JOIN WORK.TEMP4 ON (TEMP6.VAR1=TEMP4.VAR1);
    /* Distinct (IDHH2, CIP2) combinations found so far */
    CREATE TABLE WORK.TEMP8 AS SELECT DISTINCT IDHH2, CIP2 FROM WORK.TEMP7;
    /* CIP3: every CIP2 value listed in TEMP8 for the row's IDHH2 */
    CREATE TABLE WORK.TEMP9 AS SELECT TEMP7.*, TEMP8.CIP2 AS CIP3 FROM WORK.TEMP7 INNER JOIN WORK.TEMP8 ON (TEMP7.IDHH2=TEMP8.IDHH2);
    /* IDHH3: every IDHH2 value listed in TEMP8 for the row's CIP3 */
    CREATE TABLE WORK.TEMP10 AS SELECT TEMP9.*, TEMP8.IDHH2 AS IDHH3 FROM WORK.TEMP9 INNER JOIN WORK.TEMP8 ON (TEMP9.CIP3=TEMP8.CIP2);
    /* Distinct (VAR1, IDHH3) pairs, with IDHH3 renamed to VAR2 */
    CREATE TABLE WORK.TEMP11 AS SELECT DISTINCT VAR1, IDHH3 AS VAR2 FROM WORK.TEMP10 ORDER BY VAR1, IDHH3;
    /* Largest such VAR2 per VAR1 */
    CREATE TABLE WORK.TEMP12 AS SELECT VAR1, MAX(VAR2) AS VAR2 FROM WORK.TEMP11 GROUP BY VAR1;
    /* Keep only the TEMP11 rows that carry that largest value */
    CREATE TABLE WORK.TEMP13 AS SELECT TEMP11.* FROM WORK.TEMP11 INNER JOIN WORK.TEMP12 ON (TEMP11.VAR1=TEMP12.VAR1 AND TEMP11.VAR2=TEMP12.VAR2);
    /* Attach that value to the original rows by VAR1 as IDPERSONA */
    CREATE TABLE WORK.TEMP14 AS SELECT TEMP3.*, TEMP13.VAR2 AS IDPERSONA FROM WORK.TEMP3 LEFT JOIN WORK.TEMP13 ON (TEMP3.VAR1=TEMP13.VAR1);
    /* Distinct non-missing (VAR2, IDPERSONA) pairs, used as a second lookup */
    CREATE TABLE WORK.TEMP15 AS SELECT DISTINCT VAR2, IDPERSONA FROM WORK.TEMP14 WHERE VAR2^=. AND IDPERSONA^=.;
    /* Look the ID up again by VAR2 as IDPERSONA2; result ordered by DATE */
    CREATE TABLE WORK.TEMP16 AS SELECT TEMP14.*, TEMP15.IDPERSONA AS IDPERSONA2 FROM WORK.TEMP14 LEFT JOIN WORK.TEMP15 ON (TEMP14.VAR2=TEMP15.VAR2) ORDER BY DATE;
QUIT;

/* Where the VAR1 lookup left IDPERSONA missing, fall back to the VAR2 lookup,
   then drop the helper column */
DATA TEMP16;
    SET TEMP16;
    IF IDPERSONA=. THEN IDPERSONA=IDPERSONA2;
    DROP IDPERSONA2;
RUN;

正确的结果:

VAR2    VAR1    DATE    DUMMY   IDPERSONA
1   A   1   1   9
1   A   2   1   9
1   B   3   1   9
2   C   4   1   2
3   D   5   1   9
4   E   6   1   4
5   F   7   1   5
6   B   8   1   9
7   B   9   1   9
1   D   10  1   9
1   X   11  1   9
7   G   14  1   9
6   Y   15  1   9
6   D   16  1   9
6   I   18  1   9
8   D   20  1   9
9   Z   21  1   9
9   X   22  1   9

答案 4 :(得分:0)

我忘了发布我的最终解决方案,它是一个SAS宏。我还另外写了一个处理3个变量的版本。

%MACRO GROUPER2(INDATA,OUTDATA,ID1,ID2,IDOUT,IDN=_N_,MAXN=5);
    /* Assign a common group ID to rows linked by a shared &ID1. or a shared
       &ID2. value, ignoring missing key values when linking.
       Parameters:
         INDATA  - input dataset
         OUTDATA - output dataset (&ID1., &ID2., &IDOUT.); lookup tables
                   &OUTDATA._1 (by &ID1.) and &OUTDATA._2 (by &ID2.) are
                   also created
         ID1     - first identification variable
         ID2     - second identification variable
         IDOUT   - name of the variable that receives the group ID
         IDN     - expression used as the starting ID of each row; the
                   default _N_ numbers the rows automatically
         MAXN    - maximum number of improvement passes
       Work tables named _G_* are created in the default library and are
       deleted at the end. */

    /* Keep working variables local so that a caller's own macro variables
       with the same names (for example a loop counter I) are not overwritten */
    %LOCAL I STOPFLAG MAXIDOUT ITERATIONS;
    /* Defined up front so the report below is valid even if the loop body
       never executes (MAXN < 1) */
    %LET STOPFLAG = 0;
    %LET ITERATIONS = 0;

    %PUT ****************************************************************;
    %PUT ****************************************************************;
    %PUT ****    GROUPER MACRO;
    %PUT ****    PARAMETERS:;
    %PUT ****        INPUT DATA: &INDATA.;
    %PUT ****        OUTPUT DATA: &OUTDATA.;
    %PUT ****        FIRST VARIABLE: &ID1.;
    %PUT ****        SECOND VARIABLE: &ID2.;
    %PUT ****        OUTPUT GROUPING VARIABLE: &IDOUT.;
    %IF (&IDN.=_N_) %THEN %PUT ****        STARTING NUMBER VARIABLE: AUTONUMBER;
    %ELSE %PUT ****        STARTING NUMBER VARIABLE: &IDN.;
    %PUT ****        MAX ITERATIONS: &MAXN.;
    %PUT ****************************************************************;
    %PUT ****************************************************************;

    /* CREATE FIRST GUESS FOR GROUP ID.
       ROWS WHOSE STARTING ID IS MISSING ARE SET ASIDE IN _G_TEMP2 */
    DATA _G_TEMP1 _G_TEMP2;
        SET &INDATA.;
        &IDOUT.=&IDN.;
        IF &IDOUT.=. THEN OUTPUT _G_TEMP2;
        ELSE OUTPUT _G_TEMP1;
    RUN;
    /* HIGHEST STARTING ID IN USE. COALESCE TO 0 SO THAT AN EMPTY _G_TEMP1
       (EVERY STARTING ID MISSING) DOES NOT TURN THE GENERATED IDS BELOW
       INTO MISSING VALUES */
    PROC SQL NOPRINT;
        SELECT COALESCE(MAX(&IDOUT.), 0) INTO :MAXIDOUT FROM _G_TEMP1;
    QUIT;
    /* NUMBER THE SET-ASIDE ROWS AFTER THE HIGHEST STARTING ID */
    DATA _G_TEMP2;
        SET _G_TEMP2;
        &IDOUT.=_N_+&MAXIDOUT.;
    RUN;
    DATA _G_TEMP;
        SET _G_TEMP1 _G_TEMP2;
    RUN;

    /* ROWS WITH BOTH KEYS MISSING CANNOT BE LINKED TO ANYTHING: NO ID */
    PROC SQL;
        UPDATE _G_TEMP SET &IDOUT.=. WHERE &ID1. IS NULL AND &ID2. IS NULL;
    QUIT;

    /* LOOP, IMPROVE GROUP ID EACH TIME */
    %LET I = 1;
    %DO %WHILE (&I. <= &MAXN.);
        %PUT LOOP NUMBER &I.;
        %LET I = %EVAL(&I. + 1);

        PROC SQL NOPRINT;
            /* FIND THE LOWEST GROUP ID FOR EACH GROUP OF FIRST VARIABLE */
            CREATE TABLE _G_MAP1 AS SELECT MIN(&IDOUT.) AS &IDOUT., &ID1. FROM _G_TEMP WHERE &ID1. IS NOT NULL GROUP BY &ID1.;
            /* FIND THE LOWEST GROUP ID FOR EACH GROUP OF SECOND VARIABLE */
            CREATE TABLE _G_MAP2 AS SELECT MIN(&IDOUT.) AS &IDOUT., &ID2. FROM _G_TEMP WHERE &ID2. IS NOT NULL GROUP BY &ID2.;
            /* FIND THE LOWEST GROUP ID FROM BOTH GROUPING VARIABLES.
               MIN() WITH TWO ARGUMENTS IS THE ROW-WISE MINIMUM HERE */
            CREATE TABLE _G_NEW AS SELECT A.&ID1., A.&ID2., COALESCE(MIN(B.&IDOUT., C.&IDOUT.), A.&IDOUT.) AS &IDOUT.,
                A.&IDOUT. AS &IDOUT._OLD FROM _G_TEMP AS A FULL OUTER JOIN _G_MAP1 AS B ON A.&ID1. = B.&ID1.
                FULL OUTER JOIN _G_MAP2 AS C ON A.&ID2. = C.&ID2.;
            /* PUT RESULTS INTO TEMPORARY DATASET READY FOR NEXT ITERATION */
            CREATE TABLE _G_TEMP AS SELECT * FROM _G_NEW ORDER BY &ID1., &ID2.;
            /* CHECK IF THE ITERATION PROVIDED ANY IMPROVEMENT.
               COALESCE TO 1: AN EMPTY TABLE HAS NOTHING LEFT TO IMPROVE */
            SELECT COALESCE(MIN(CASE WHEN &IDOUT._OLD = &IDOUT. THEN 1 ELSE 0 END), 1) INTO :STOPFLAG FROM _G_TEMP;
            %PUT NO IMPROVEMENT? &STOPFLAG.;
        QUIT;
        /* END LOOP IF ID UNCHANGED OVER LAST ITERATION */
        %LET ITERATIONS=%EVAL(&I. - 1);
        %IF &STOPFLAG. %THEN %LET I = %EVAL(&MAXN. + 1); 
    %END;

    %PUT ****************************************************************;
    %PUT ****************************************************************;
    %IF &STOPFLAG. %THEN %PUT ****    LOOPING ENDED BY NO-IMPROVEMENT CRITERIA. OUTPUT FULLY GROUPED.; 
    %ELSE %PUT ****    WARNING: LOOPING ENDED BY REACHING THE MAXIMUM NUMBER OF ITERATIONS. OUTPUT NOT FULLY GROUPED.;
    %PUT ****    NUMBER OF ITERATIONS: &ITERATIONS. (MAX: &MAXN.);
    %PUT ****************************************************************;
    %PUT ****************************************************************;

    /* ROW-LEVEL OUTPUT WITHOUT THE WORKING COLUMN */
    DATA &OUTDATA.;
        SET _G_TEMP;
        DROP &IDOUT._OLD;
    RUN;
    /* OUTPUT LOOKUP TABLES, ONE PER KEY VARIABLE */
    PROC SQL;
        CREATE TABLE &OUTDATA._1 AS SELECT &ID1., MIN(&IDOUT.) AS &IDOUT. FROM _G_TEMP WHERE &ID1. IS NOT NULL GROUP BY &ID1. ORDER BY &ID1.;
        CREATE TABLE &OUTDATA._2 AS SELECT &ID2., MIN(&IDOUT.) AS &IDOUT. FROM _G_TEMP WHERE &ID2. IS NOT NULL GROUP BY &ID2. ORDER BY &ID2.;
    QUIT; 

    /* CLEAN UP */
    PROC DATASETS NOLIST;
        DELETE _G_:;
    QUIT;
%MEND GROUPER2;