R - 合并标识符/行问题

时间:2015-07-26 13:18:26

标签: r merge

我遇到一个简单merge的问题,我无法弄清问题是什么。

我想合并两个数据集 dt 和 mt。每个人(idno_simple)都被观察两次。像这样:

  householdid.x idno_simple    idno numchild.x  day_rec   isex
1        101366     1013661 1013661          2 Weekdays FEMALE
2        101366     1013661 1013661          2  Weekend FEMALE
3        101366     1013662 1013662          2 Weekdays   MALE
4        101366     1013662 1013662          2  Weekend   MALE
5        102481     1024811 1024811          0 Weekdays FEMALE
6        102481     1024811 1024811          0  Weekend FEMALE

我不明白为什么当我合并这两个数据集时,每个 idno_simple 突然重复出现了 4 次。

# Join the two data frames on the person id alone.
mdt <- merge(dt, mt, by = "idno_simple")

 idno_simple householdid.x    idno numchild.x  day_rec   isex hldid_sim persid_sim id_sim nchild_sim day_sim diary_sim sex_sim
1     1013661        101366 1013661          2 Weekdays FEMALE    101366          1      1          2       5         1       2
2     1013661        101366 1013661          2 Weekdays FEMALE    101366          1      2          2       1         2       2
3     1013661        101366 1013661          2  Weekend FEMALE    101366          1      1          2       5         1       2
4     1013661        101366 1013661          2  Weekend FEMALE    101366          1      2          2       1         2       2
5     1013662        101366 1013662          2 Weekdays   MALE    101366          2      1          2       5         1       1
6     1013662        101366 1013662          2 Weekdays   MALE    101366          2      2          2       1         2       1

第一个数据集

# First data set (20 rows, 6 columns): every idno_simple value occurs on two
# consecutive rows, and every householdid.x value on four.
dt <- structure(
  list(
    householdid.x = rep(
      c("101366", "102481", "103755", "103788", "103799"),
      each = 4L
    ),
    idno_simple = rep(
      c(
        "1013661", "1013662", "1024811", "1024812", "1037551",
        "1037552", "1037881", "1037882", "1037991", "1037992"
      ),
      each = 2L
    ),
    # Same ids as idno_simple, but stored as doubles.
    idno = rep(
      c(
        1013661, 1013662, 1024811, 1024812, 1037551,
        1037552, 1037881, 1037882, 1037991, 1037992
      ),
      each = 2L
    ),
    # Factor with levels "0".."5"; only "0", "1" and "2" occur in the data.
    numchild.x = factor(
      rep(c("2", "0", "1", "2"), times = c(4L, 4L, 4L, 8L)),
      levels = c("0", "1", "2", "3", "4", "5")
    ),
    # The order of the two day types is not the same for every person.
    day_rec = c(
      "Weekdays", "Weekend", "Weekdays", "Weekend",
      "Weekdays", "Weekend", "Weekend", "Weekdays",
      "Weekdays", "Weekend", "Weekend", "Weekdays",
      "Weekend", "Weekdays", "Weekdays", "Weekend",
      "Weekdays", "Weekend", "Weekend", "Weekdays"
    ),
    isex = factor(
      rep(c("FEMALE", "FEMALE", "MALE", "MALE"), times = 5L),
      levels = c("FEMALE", "MALE")
    )
  ),
  row.names = c(NA, -20L),
  class = "data.frame"
)

第二个数据集

# Second data set (20 rows, 8 columns): every idno_simple value occurs on two
# consecutive rows, numbered 1 and 2 in id_sim and diary_sim.
mt <- structure(
  list(
    hldid_sim = rep(c(101366, 102481, 103755, 103788, 103799), each = 4L),
    persid_sim = rep(c(1, 1, 2, 2), times = 5L),
    id_sim = rep(c(1L, 2L), times = 10L),
    nchild_sim = rep(c(2L, 0L, 1L, 2L), times = c(4L, 4L, 4L, 8L)),
    idno_simple = rep(
      c(
        "1013661", "1013662", "1024811", "1024812", "1037551",
        "1037552", "1037881", "1037882", "1037991", "1037992"
      ),
      each = 2L
    ),
    day_sim = c(
      5L, 1L, 5L, 1L,
      1L, 4L, 1L, 4L,
      6L, 7L, 6L, 7L,
      7L, 3L, 7L, 3L,
      1L, 4L, 1L, 4L
    ),
    diary_sim = rep(c(1L, 2L), times = 10L),
    sex_sim = rep(c(2L, 2L, 1L, 1L), times = 5L)
  ),
  # Kept exactly as in the original dput() output.
  row.names = c(NA, 20L),
  class = "data.frame"
)

有什么想法吗?

在我看来,输出应该保持每个人 2 行的结构。我不确定这是否可行。

 idno_simple householdid.x    idno numchild.x  day_rec   isex hldid_sim persid_sim id_sim nchild_sim day_sim diary_sim sex_sim
1     1013661        101366 1013661          2 Weekdays FEMALE    101366          1      1          2       5         1       2
2     1013661        101366 1013661          2  Weekend FEMALE    101366          1      2          2       1         2       2
3     1013662        101366 1013662          2 Weekdays   MALE    101366          2      1          2       5         1       1
4     1013662        101366 1013662          2  Weekend   MALE    101366          2      2          2       1         2       1

1 个答案:

答案 0 :(得分:1)

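出现 4 行是因为 merge 会把两个数据集中键值相同的行全部两两配对:每个 idno_simple 在 dt 和 mt 中各有 2 行,于是得到 2 × 2 = 4 行。您可以使用 library(splitstackshape) 中的 getanID,根据“idno_simple”中的重复元素在每个数据集中创建 .id 列。然后使用 devel 版本的 data.table 连接数据集,该版本提供了 on 选项,因此无需设置 key。devel 版本的安装说明见 here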

 library(splitstackshape)
 library(data.table) # v1.9.5+ (needed for the `on` argument)
 # getanID() adds a `.id` column derived from the repeated idno_simple values;
 # joining on both columns needs no key to be set beforehand.
 getanID(dt, "idno_simple")[
   getanID(mt, "idno_simple"),
   on = c("idno_simple", ".id")
 ]

或者,基础 R 的做法是先用 ave 创建序号列,然后再用 merge 合并,如评论中所述:

  # Number the rows within each idno_simple (1, 2, ...) in both data frames.
  # seq_len(nrow(.)) replaces 1:nrow(.) so that a zero-row data frame yields an
  # empty index instead of c(1, 0).
  dt$indx <- with(dt, ave(seq_len(nrow(dt)), idno_simple, FUN = seq_along))
  mt$indx <- with(mt, ave(seq_len(nrow(mt)), idno_simple, FUN = seq_along))
  # Merging on the compound key (idno_simple, indx) pairs the k-th row of an id
  # in dt with the k-th row of that id in mt, instead of every combination.
  # NOTE(review): the pairing is purely positional (row order within each id).
  merge(dt, mt, by = c("idno_simple", "indx"))