我们有以下数据集:
DATE VAR1 VAR2
1 A 1
2 A 1
3 B 1
4 C 2
5 D 3
6 E 4
7 F 5
8 B 6
9 B 7
10 D 1
每条记录都属于一个人,问题是一个人可以拥有多条不同值的记录。
识别一个人的规则:如果两条记录具有相同的VAR1,则它们属于同一个人;如果两条记录具有相同的VAR2,则它们也属于同一个人。
我的目标是创建一个新的变量IDPERSON,它唯一地标识每条记录的人。在我的例子中,只有4个不同的人:
DATE VAR1 VAR2 IDPERSON
1 A 1 1
2 A 1 1
3 B 1 1
4 C 2 2
5 D 3 1
6 E 4 3
7 F 5 4
8 B 6 1
9 B 7 1
10 D 1 1
如何使用SQL或SAS实现此目的?
答案 0 :(得分:1)
%macro grouper(
    inData   /* Input dataset */,
    outData  /* Output dataset (lookup from id1 to group ID) */,
    id1      /* First identification variable (must be numeric) */,
    id2      /* Second identification variable */,
    idOut    /* Name of variable to contain group ID */,
    maxN = 5 /* Max number of iterations in case of failure */);

    /*
     * Assign an ID to each distinct connected graph in a network.
     *
     * Two records are linked when they share the same id1 value or the same
     * id2 value. Every record starts with its own id1 value as group ID; on
     * each pass the group ID is lowered to the smallest group ID carried by
     * any record with the same id1 or the same id2. Iteration stops when a
     * pass changes nothing, or after maxN passes.
     *
     * Output: outData holds one row per id1 value with its group ID.
     * Work tables named _g_* are created and deleted by this macro.
     */

    /* Keep loop state local so the caller's macro variables are not overwritten */
    %local i stopFlag;
    %let stopFlag = 0;

    /* Create first guess for group ID */
    data _g_temp;
        set &inData.;
        &idOut. = &id1.;
    run;

    /* Loop, improve group ID each time */
    %let i = 1;
    %do %while (&i. <= &maxN.);
        %put Loop number &i.;
        %let i = %eval(&i. + 1);

        proc sql noprint;
            /* Find the lowest group ID for each group of first variable */
            create table _g_map1 as
                select
                    min(&idOut.) as &idOut.,
                    &id1.
                from _g_temp
                group by &id1.;

            /* Find the lowest group ID for each group of second variable */
            create table _g_map2 as
                select
                    min(&idOut.) as &idOut.,
                    &id2.
                from _g_temp
                group by &id2.;

            /* Find the lowest group ID from both grouping variables.
               Both maps are built from _g_temp, so every map row matches at
               least one record; a left join keeps exactly one output row per
               input record, which is the intent. */
            create table _g_new as
                select
                    a.&id1.,
                    a.&id2.,
                    coalesce(min(b.&idOut., c.&idOut.), a.&idOut.) as &idOut.,
                    a.&idOut. as &idOut._old
                from _g_temp as a
                left join _g_map1 as b
                    on a.&id1. = b.&id1.
                left join _g_map2 as c
                    on a.&id2. = c.&id2.;

            /* Put results into temporary dataset ready for next iteration */
            create table _g_temp as
                select *
                from _g_new;

            /* Check if the iteration provided any improvement.
               1 = no record changed, 0 = at least one record changed.
               coalesce() turns the missing result of an empty table into 1
               so that an empty input stops cleanly instead of breaking %if. */
            select
                coalesce(
                    min(case when &idOut._old = &idOut. then 1 else 0 end),
                    1)
                into :stopFlag
            from _g_temp;
        quit;

        /* End loop if ID unchanged over last iteration */
        %if &stopFlag. %then %let i = %eval(&maxN. + 1);
    %end;

    /* Tell the user when the iteration cap was reached before convergence */
    %if not (&stopFlag.) %then
        %put WARNING: grouper reached maxN=&maxN. iterations without converging - &idOut. may not be fully grouped.;

    /* Output lookup table */
    proc sql;
        create table &outData. as
            select
                &id1.,
                min(&idOut.) as &idOut.
            from _g_temp
            group by &id1.;
    quit;

    /* Clean up */
    proc datasets nolist;
        delete _g_:;
    quit;
%mend grouper;
/* Sample input for the grouper macro: numeric VAR1 and character VAR2 */
data baseData;
    input VAR1 VAR2 $;
    datalines;
1 A
1 A
1 B
2 C
3 D
4 E
5 F
6 B
7 B
1 D
1 X
7 G
6 Y
6 D
6 I
8 D
9 Z
9 X
;
run;

/* Group baseData on VAR1/VAR2; the lookup table is written to outData
   with the group ID stored in groupID */
%grouper(baseData, outData, VAR1, VAR2, groupID);
答案 1 :(得分:0)
你觉得这会起作用吗?
它是用SAS编写的,但它使用的是SQL语句。
/* Sample data from the question: numeric VAR1, character VAR2, numeric DATE */
data TEMP3;
    input VAR1 VAR2 $ DATE;
    datalines;
1 A 1
1 A 2
1 B 3
2 C 4
3 D 5
4 E 6
5 F 7
6 B 8
7 B 9
1 D 10
;
run;
/* Derive a person ID for every TEMP3 record by following a fixed number of
   VAR1/VAR2 links and keeping the largest VAR1 reached for each VAR2.
   NOTE(review): the chain length is fixed by the number of joins below, so
   longer chains of linked records are presumably not merged - confirm. */
proc sql;
    /* Distinct (VAR2, VAR1) pairs */
    create table WORK.TEMP4 as
        select distinct VAR2, VAR1
        from WORK.TEMP3
        order by VAR2, VAR1;

    /* The same distinct pairs, ordered by VAR1 first */
    create table WORK.TEMP5 as
        select distinct VAR1, VAR2
        from WORK.TEMP3
        order by VAR1, VAR2;

    /* VAR22: every VAR2 that shares a VAR1 with the pair */
    create table WORK.TEMP6 as
        select
            t4.VAR2,
            t4.VAR1,
            t5.VAR2 as VAR22
        from WORK.TEMP4 as t4
        inner join WORK.TEMP5 as t5
            on (t4.VAR1 = t5.VAR1);

    /* VAR12: every VAR1 that shares the pair's VAR2 */
    create table WORK.TEMP7 as
        select
            t6.*,
            t5.VAR1 as VAR12
        from WORK.TEMP6 as t6
        inner join WORK.TEMP5 as t5
            on (t6.VAR2 = t5.VAR2);

    /* Distinct (VAR22, VAR12) links */
    create table WORK.TEMP8 as
        select distinct VAR22, VAR12
        from WORK.TEMP7
        order by VAR22, VAR12;

    /* Largest linked VAR12 for each VAR22 */
    create table WORK.TEMP9 as
        select
            VAR22,
            max(VAR12) as VAR12
        from WORK.TEMP8
        group by VAR22;

    /* Keep only the link that carries that largest VAR12 */
    create table WORK.TEMP10 as
        select t8.*
        from WORK.TEMP8 as t8
        inner join WORK.TEMP9 as t9
            on (t8.VAR22 = t9.VAR22 and t8.VAR12 = t9.VAR12);

    /* Attach the chosen VAR12 to the original records as IDPERSONA */
    create table WORK.TEMP11 as
        select
            t3.*,
            t10.VAR12 as IDPERSONA
        from WORK.TEMP3 as t3
        left join WORK.TEMP10 as t10
            on (t3.VAR2 = t10.VAR22);
quit;
答案 2 :(得分:0)
我已将此问题分解为几个步骤,这些步骤适用于您提供的数据。可能有一种方法可以减少步骤数,但会牺牲可读性。如果这适用于您的真实数据,请告诉我。
/* Build the input dataset from the question: DATE, character VAR1, numeric VAR2 */
data have;
    input DATE VAR1 $ VAR2;
    cards;
1 A 1
2 A 1
3 B 1
4 C 2
5 D 3
6 E 4
7 F 5
8 B 6
9 B 7
10 D 1
;
run;
/* Step 1: smallest VAR2 seen for each VAR1 value, stored as temp_var.
   The automatic _TYPE_ and _FREQ_ columns are dropped. */
proc summary data=have nway idmin;
    class var1;
    output out=minvar2 (drop=_:) min(var2)=temp_var;
run;

/* Step 2: attach that minimum to every record and sort by it, so records
   with the same temp_var end up adjacent.
   NOTE(review): records are linked only through the minimum VAR2 of their
   VAR1, so chains that need more than one hop may not be merged - confirm
   against real data. */
proc sql;
    create table temp1 as
        select
            rec.*,
            mv.temp_var
        from have as rec
        inner join minvar2 as mv
            on rec.var1 = mv.var1
        order by mv.temp_var;
quit;

/* Step 3: number the temp_var groups 1, 2, 3, ... as idperson */
data want;
    set temp1;
    by temp_var;
    if first.temp_var then idperson + 1;   /* sum statement: value is retained across rows */
    drop temp_var;
run;

/* Step 4: restore the original record order */
proc sort data=want;
    by date var1;
run;
答案 3 :(得分:0)
基思:
您的解决方案无法正常运行,请查看以下数据集:
/* Larger test dataset: numeric VAR2, character VAR1, numeric DATE.
   DUMMY is set to 1 on every record. */
data TEMP3;
    input VAR2 VAR1 $ DATE;
    DUMMY = 1;
    datalines;
1 A 1
1 A 2
1 B 3
2 C 4
3 D 5
4 E 6
5 F 7
6 B 8
7 B 9
1 D 10
1 X 11
7 G 14
6 Y 15
6 D 16
6 I 18
8 D 20
9 Z 21
9 X 22
;
run;
您的计划的结果是:
VAR2 VAR1 DATE DUMMY idperson
1 A 1 1 1
1 A 2 1 1
1 B 3 1 1
2 C 4 1 2
3 D 5 1 1
4 E 6 1 3
5 F 7 1 4
6 B 8 1 1
7 B 9 1 1
1 D 10 1 1
1 X 11 1 1
7 G 14 1 6
6 Y 15 1 5
6 D 16 1 1
6 I 18 1 5
8 D 20 1 1
9 Z 21 1 7
9 X 22 1 1
由于VAR2 = 6的记录(第一列)被分配了两个不同的ID(1和5),因此结果不正确。
这就是我所做的,整个程序(这里没有发布)更复杂(并不那么优雅),因为它处理Var1和Var2中缺少的数据。
/* Derive IDPERSONA for every TEMP3 record by following VAR1/VAR2 links a
   fixed number of times and keeping the largest VAR2 reached for each VAR1.
   Only records with DUMMY = 1 and a non-missing VAR2 feed the link tables.
   NOTE(review): the number of hops is fixed by the joins below, so longer
   chains are presumably not covered - confirm. */
proc sql;
    /* Distinct (VAR1, VAR2) pairs */
    create table WORK.TEMP4 as
        select distinct VAR1, VAR2
        from WORK.TEMP3
        where DUMMY = 1 and VAR2 ^= .
        order by VAR1, VAR2;

    /* The same distinct pairs, ordered by VAR2 first */
    create table WORK.TEMP5 as
        select distinct VAR2, VAR1
        from WORK.TEMP3
        where DUMMY = 1 and VAR2 ^= .
        order by VAR2, VAR1;

    /* CIP2: every VAR1 that shares the pair's VAR2 */
    create table WORK.TEMP6 as
        select
            t4.*,
            t5.VAR1 as CIP2
        from WORK.TEMP4 as t4
        inner join WORK.TEMP5 as t5
            on (t4.VAR2 = t5.VAR2);

    /* IDHH2: every VAR2 that shares the pair's VAR1 */
    create table WORK.TEMP7 as
        select
            t6.*,
            t4.VAR2 as IDHH2
        from WORK.TEMP6 as t6
        inner join WORK.TEMP4 as t4
            on (t6.VAR1 = t4.VAR1);

    /* Distinct (IDHH2, CIP2) links found so far */
    create table WORK.TEMP8 as
        select distinct IDHH2, CIP2
        from WORK.TEMP7;

    /* CIP3: one more hop through the links, matched on IDHH2 */
    create table WORK.TEMP9 as
        select
            t7.*,
            t8.CIP2 as CIP3
        from WORK.TEMP7 as t7
        inner join WORK.TEMP8 as t8
            on (t7.IDHH2 = t8.IDHH2);

    /* IDHH3: one more hop through the links, matched on CIP3 */
    create table WORK.TEMP10 as
        select
            t9.*,
            t8.IDHH2 as IDHH3
        from WORK.TEMP9 as t9
        inner join WORK.TEMP8 as t8
            on (t9.CIP3 = t8.CIP2);

    /* Distinct VAR1 with every reached IDHH3, renamed to VAR2 */
    create table WORK.TEMP11 as
        select distinct
            VAR1,
            IDHH3 as VAR2
        from WORK.TEMP10
        order by VAR1, IDHH3;

    /* Largest reached VAR2 for each VAR1 */
    create table WORK.TEMP12 as
        select
            VAR1,
            max(VAR2) as VAR2
        from WORK.TEMP11
        group by VAR1;

    /* Keep only the row that carries that largest VAR2 */
    create table WORK.TEMP13 as
        select t11.*
        from WORK.TEMP11 as t11
        inner join WORK.TEMP12 as t12
            on (t11.VAR1 = t12.VAR1 and t11.VAR2 = t12.VAR2);

    /* Attach the chosen value to the original records as IDPERSONA */
    create table WORK.TEMP14 as
        select
            t3.*,
            t13.VAR2 as IDPERSONA
        from WORK.TEMP3 as t3
        left join WORK.TEMP13 as t13
            on (t3.VAR1 = t13.VAR1);

    /* Distinct (VAR2, IDPERSONA) pairs where both values are present */
    create table WORK.TEMP15 as
        select distinct VAR2, IDPERSONA
        from WORK.TEMP14
        where VAR2 ^= . and IDPERSONA ^= .;

    /* IDPERSONA2: an ID looked up through VAR2, used as a fallback below */
    create table WORK.TEMP16 as
        select
            t14.*,
            t15.IDPERSONA as IDPERSONA2
        from WORK.TEMP14 as t14
        left join WORK.TEMP15 as t15
            on (t14.VAR2 = t15.VAR2)
        order by DATE;
quit;

/* Fill a missing IDPERSONA from the VAR2-based fallback, then drop the helper */
data TEMP16;
    set TEMP16;
    if IDPERSONA = . then IDPERSONA = IDPERSONA2;
    drop IDPERSONA2;
run;
正确的结果:
VAR2 VAR1 DATE DUMMY IDPERSONA
1 A 1 1 9
1 A 2 1 9
1 B 3 1 9
2 C 4 1 2
3 D 5 1 9
4 E 6 1 4
5 F 7 1 5
6 B 8 1 9
7 B 9 1 9
1 D 10 1 9
1 X 11 1 9
7 G 14 1 9
6 Y 15 1 9
6 D 16 1 9
6 I 18 1 9
8 D 20 1 9
9 Z 21 1 9
9 X 22 1 9
答案 4 :(得分:0)
我忘了发布我的最终解决方案,它是一个SAS宏。我还另外写了一个支持3个变量的版本。
%MACRO GROUPER2(INDATA,OUTDATA,ID1,ID2,IDOUT,IDN=_N_,MAXN=5);
    /*
     * Assign a group ID to records that are linked by sharing ID1 or ID2.
     *
     * Parameters:
     *   INDATA  - input dataset
     *   OUTDATA - output dataset; OUTDATA_1 and OUTDATA_2 are also created
     *   ID1     - first identification variable
     *   ID2     - second identification variable
     *   IDOUT   - name of the variable that receives the group ID
     *   IDN     - expression giving the starting ID of each record
     *             (default _N_, the row number)
     *   MAXN    - maximum number of iterations (default 5)
     *
     * Every record starts with IDN as its ID; records whose starting ID is
     * missing get a fresh ID above the largest existing one. Records with
     * both ID1 and ID2 missing get a missing group ID. On each pass the ID
     * is lowered to the smallest ID carried by any record with the same ID1
     * or the same ID2, until a pass changes nothing or MAXN passes have run.
     *
     * Outputs:
     *   OUTDATA   - ID1, ID2 and IDOUT for every record
     *   OUTDATA_1 - lowest IDOUT per non-missing ID1
     *   OUTDATA_2 - lowest IDOUT per non-missing ID2
     * Work tables named _G_* are created and deleted by this macro.
     */

    /* Keep working variables local so the caller's macro variables are not overwritten */
    %LOCAL I STOPFLAG MAXIDOUT ITERATIONS;
    /* Defined up front so the summary below also works when MAXN is below 1 */
    %LET STOPFLAG = 0;
    %LET ITERATIONS = 0;

    %PUT ****************************************************************;
    %PUT ****************************************************************;
    %PUT **** GROUPER MACRO;
    %PUT **** PARAMETERS:;
    %PUT **** INPUT DATA: &INDATA.;
    %PUT **** OUTPUT DATA: &OUTDATA.;
    %PUT **** FIRST VARIABLE: &ID1.;
    %PUT **** SECOND VARIABLE: &ID2.;
    %PUT **** OUTPUT GROUPING VARIABLE: &IDOUT.;
    %IF (&IDN.=_N_) %THEN %PUT **** STARTING NUMBER VARIABLE: AUTONUMBER;
    %ELSE %PUT **** STARTING NUMBER VARIABLE: &IDN.;
    %PUT **** MAX ITERATIONS: &MAXN.;
    %PUT ****************************************************************;
    %PUT ****************************************************************;

    /* CREATE FIRST GUESS FOR GROUP ID:
       _G_TEMP1 gets records with a starting ID, _G_TEMP2 those without */
    DATA _G_TEMP1 _G_TEMP2;
        SET &INDATA.;
        &IDOUT.=&IDN.;
        IF &IDOUT.=. THEN OUTPUT _G_TEMP2;
        ELSE OUTPUT _G_TEMP1;
    RUN;

    /* Largest starting ID; 0 when no record has one, so that the IDs
       generated below do not all become missing */
    PROC SQL NOPRINT;
        SELECT COALESCE(MAX(&IDOUT.), 0) INTO :MAXIDOUT FROM _G_TEMP1;
    QUIT;

    /* Give records without a starting ID fresh IDs above the largest one */
    DATA _G_TEMP2;
        SET _G_TEMP2;
        &IDOUT.=_N_+&MAXIDOUT.;
    RUN;

    DATA _G_TEMP;
        SET _G_TEMP1 _G_TEMP2;
    RUN;

    /* Records with neither identifier cannot be linked: leave their ID missing */
    PROC SQL;
        UPDATE _G_TEMP SET &IDOUT.=. WHERE &ID1. IS NULL AND &ID2. IS NULL;
    QUIT;

    /* LOOP, IMPROVE GROUP ID EACH TIME */
    %LET I = 1;
    %DO %WHILE (&I. <= &MAXN.);
        %PUT LOOP NUMBER &I.;
        %LET I = %EVAL(&I. + 1);

        PROC SQL NOPRINT;
            /* FIND THE LOWEST GROUP ID FOR EACH GROUP OF FIRST VARIABLE */
            CREATE TABLE _G_MAP1 AS
                SELECT MIN(&IDOUT.) AS &IDOUT., &ID1.
                FROM _G_TEMP
                WHERE &ID1. IS NOT NULL
                GROUP BY &ID1.;

            /* FIND THE LOWEST GROUP ID FOR EACH GROUP OF SECOND VARIABLE */
            CREATE TABLE _G_MAP2 AS
                SELECT MIN(&IDOUT.) AS &IDOUT., &ID2.
                FROM _G_TEMP
                WHERE &ID2. IS NOT NULL
                GROUP BY &ID2.;

            /* FIND THE LOWEST GROUP ID FROM BOTH GROUPING VARIABLES.
               Both maps are built from _G_TEMP, so every map row matches at
               least one record; a left join keeps exactly one output row per
               input record, which is the intent. */
            CREATE TABLE _G_NEW AS
                SELECT
                    A.&ID1.,
                    A.&ID2.,
                    COALESCE(MIN(B.&IDOUT., C.&IDOUT.), A.&IDOUT.) AS &IDOUT.,
                    A.&IDOUT. AS &IDOUT._OLD
                FROM _G_TEMP AS A
                LEFT JOIN _G_MAP1 AS B
                    ON A.&ID1. = B.&ID1.
                LEFT JOIN _G_MAP2 AS C
                    ON A.&ID2. = C.&ID2.;

            /* PUT RESULTS INTO TEMPORARY DATASET READY FOR NEXT ITERATION */
            CREATE TABLE _G_TEMP AS
                SELECT *
                FROM _G_NEW
                ORDER BY &ID1., &ID2.;

            /* CHECK IF THE ITERATION PROVIDED ANY IMPROVEMENT.
               1 = no record changed, 0 = at least one record changed.
               COALESCE turns the missing result of an empty table into 1
               so that an empty input stops cleanly instead of breaking %IF. */
            SELECT
                COALESCE(
                    MIN(CASE WHEN &IDOUT._OLD = &IDOUT. THEN 1 ELSE 0 END),
                    1)
                INTO :STOPFLAG
            FROM _G_TEMP;
            %PUT NO IMPROVEMENT? &STOPFLAG.;
        QUIT;

        /* END LOOP IF ID UNCHANGED OVER LAST ITERATION */
        %LET ITERATIONS=%EVAL(&I. - 1);
        %IF &STOPFLAG. %THEN %LET I = %EVAL(&MAXN. + 1);
    %END;

    %PUT ****************************************************************;
    %PUT ****************************************************************;
    %IF &STOPFLAG. %THEN %PUT **** LOOPING ENDED BY NO-IMPROVEMENT CRITERIA. OUTPUT FULLY GROUPED.;
    %ELSE %PUT **** WARNING: LOOPING ENDED BY REACHING THE MAXIMUM NUMBER OF ITERARIONS. OUTPUT NOT FULLY GROUPED.;
    %PUT **** NUMBER OF ITERATIONS: &ITERATIONS. (MAX: &MAXN.);
    %PUT ****************************************************************;
    %PUT ****************************************************************;

    /* Record-level output without the helper column of the last pass */
    DATA &OUTDATA.;
        SET _G_TEMP;
        DROP &IDOUT._OLD;
    RUN;

    /* OUTPUT LOOKUP TABLES: one per identification variable */
    PROC SQL;
        CREATE TABLE &OUTDATA._1 AS
            SELECT &ID1., MIN(&IDOUT.) AS &IDOUT.
            FROM _G_TEMP
            WHERE &ID1. IS NOT NULL
            GROUP BY &ID1.
            ORDER BY &ID1.;

        CREATE TABLE &OUTDATA._2 AS
            SELECT &ID2., MIN(&IDOUT.) AS &IDOUT.
            FROM _G_TEMP
            WHERE &ID2. IS NOT NULL
            GROUP BY &ID2.
            ORDER BY &ID2.;
    QUIT;

    /* CLEAN UP */
    PROC DATASETS NOLIST;
        DELETE _G_:;
    QUIT;
%MEND GROUPER2;