我遇到了一个看似简单的 `merge` 问题,但一直弄不清原因出在哪里。
我想合并两个数据框 `dt` 和 `mt`。每个人(由 `idno_simple` 标识)都被观察了两次,像这样:
householdid.x idno_simple idno numchild.x day_rec isex
1 101366 1013661 1013661 2 Weekdays FEMALE
2 101366 1013661 1013661 2 Weekend FEMALE
3 101366 1013662 1013662 2 Weekdays MALE
4 101366 1013662 1013662 2 Weekend MALE
5 102481 1024811 1024811 0 Weekdays FEMALE
6 102481 1024811 1024811 0 Weekend FEMALE
我不明白为什么在合并这两个数据集之后,每个 `idno_simple` 突然重复出现了 4 次。
# Inner join of dt and mt keyed on idno_simple only.
mdt <- merge(x = dt, y = mt, by = "idno_simple")
idno_simple householdid.x idno numchild.x day_rec isex hldid_sim persid_sim id_sim nchild_sim day_sim diary_sim sex_sim
1 1013661 101366 1013661 2 Weekdays FEMALE 101366 1 1 2 5 1 2
2 1013661 101366 1013661 2 Weekdays FEMALE 101366 1 2 2 1 2 2
3 1013661 101366 1013661 2 Weekend FEMALE 101366 1 1 2 5 1 2
4 1013661 101366 1013661 2 Weekend FEMALE 101366 1 2 2 1 2 2
5 1013662 101366 1013662 2 Weekdays MALE 101366 2 1 2 5 1 1
6 1013662 101366 1013662 2 Weekdays MALE 101366 2 2 2 1 2 1
第一个数据集 `dt`:
# First example data set: 5 households x 2 persons x 2 observed days = 20 rows.
# Repeating patterns are written with rep() instead of a literal dump.
dt <- data.frame(
  # Household id as character; each household contributes four rows.
  householdid.x = rep(
    c("101366", "102481", "103755", "103788", "103799"),
    each = 4L
  ),
  # Person id as character; each person appears on two consecutive rows.
  idno_simple = rep(
    c(
      "1013661", "1013662", "1024811", "1024812", "1037551",
      "1037552", "1037881", "1037882", "1037991", "1037992"
    ),
    each = 2L
  ),
  # Same person id stored as a double.
  idno = rep(
    c(
      1013661, 1013662, 1024811, 1024812, 1037551,
      1037552, 1037881, 1037882, 1037991, 1037992
    ),
    each = 2L
  ),
  # Factor with the full "0".."5" level set; constant within a household.
  numchild.x = factor(
    rep(c("2", "0", "1", "2", "2"), each = 4L),
    levels = c("0", "1", "2", "3", "4", "5")
  ),
  # Day type of each observation; the order varies by person, so it is
  # spelled out row by row.
  day_rec = c(
    "Weekdays", "Weekend", "Weekdays", "Weekend",
    "Weekdays", "Weekend", "Weekend", "Weekdays",
    "Weekdays", "Weekend", "Weekend", "Weekdays",
    "Weekend", "Weekdays", "Weekdays", "Weekend",
    "Weekdays", "Weekend", "Weekend", "Weekdays"
  ),
  # Two FEMALE rows followed by two MALE rows in every household.
  isex = factor(
    rep(c("FEMALE", "MALE"), each = 2L, times = 5L),
    levels = c("FEMALE", "MALE")
  ),
  stringsAsFactors = FALSE
)
第二个数据集 `mt`:
# Second example data set: the same 10 persons, again two rows each (20 rows).
# Built with structure() so the row.names attribute stays exactly c(NA, 20L),
# as in the original dump.
mt <- structure(
  list(
    # Household id as a double; four rows per household.
    hldid_sim = rep(c(101366, 102481, 103755, 103788, 103799), each = 4L),
    # Person number within the household (double): 1, 1, 2, 2, ...
    persid_sim = rep(c(1, 2), each = 2L, times = 5L),
    # Alternates 1, 2 on consecutive rows.
    id_sim = rep(1:2, times = 10L),
    # Constant within a household.
    nchild_sim = rep(c(2L, 0L, 1L, 2L, 2L), each = 4L),
    # Person id as character; this is the column shared with dt.
    idno_simple = rep(
      c(
        "1013661", "1013662", "1024811", "1024812", "1037551",
        "1037552", "1037881", "1037882", "1037991", "1037992"
      ),
      each = 2L
    ),
    # Integer day code for each row.
    day_sim = c(
      5L, 1L, 5L, 1L,
      1L, 4L, 1L, 4L,
      6L, 7L, 6L, 7L,
      7L, 3L, 7L, 3L,
      1L, 4L, 1L, 4L
    ),
    # Alternates 1, 2 on consecutive rows.
    diary_sim = rep(1:2, times = 10L),
    # Integer sex code: 2, 2, 1, 1 in every household.
    sex_sim = rep(c(2L, 1L), each = 2L, times = 5L)
  ),
  row.names = c(NA, 20L),
  class = "data.frame"
)
有什么想法吗?
在我看来,合并后的输出应该保持每个人 2 行的结构(如下所示),但我不确定这是否可行。
idno_simple householdid.x idno numchild.x day_rec isex hldid_sim persid_sim id_sim nchild_sim day_sim diary_sim sex_sim
1 1013661 101366 1013661 2 Weekdays FEMALE 101366 1 1 2 5 1 2
2 1013661 101366 1013661 2 Weekend FEMALE 101366 1 2 2 1 2 2
3 1013662 101366 1013662 2 Weekdays MALE 101366 2 1 2 5 1 1
4 1013662 101366 1013662 2 Weekend MALE 101366 2 2 2 1 2 1
答案 0 :(得分:1)
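之所以出现 4 行,是因为 `merge` 会把 `dt` 中的每一行与 `mt` 中所有 `idno_simple` 相同的行配对,而每个人在两个数据集中各有 2 行,于是得到 2 × 2 = 4 行。
您可以使用 `library(splitstackshape)` 中的 `getanID`,根据 `idno_simple` 中的重复元素,在每个数据集中创建一个 `.id` 列。然后我们使用 devel 版本的 `data.table` 来连接这两个数据集,该版本还提供了 `on` 选项,因此无需先设置 `key`。devel 版本的安装说明见 here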
# getanID() is provided by splitstackshape; the `on =` join argument needs
# data.table v1.9.5+.
library(splitstackshape)
library(data.table) # v1.9.5+

# Add a within-id counter (.id) to both tables, then join on the id together
# with that counter.
getanID(dt, "idno_simple")[
  getanID(mt, "idno_simple"),
  on = c("idno_simple", ".id")
]
或者,基础 R 的做法是先用 `ave` 为每个数据集创建一个序号列,然后再用 `merge` 合并(正如评论中所提到的):
# Number each person's rows 1, 2, ... in row order within both data frames.
# seq_len(nrow(.)) is used instead of 1:nrow(.), because 1:0 evaluates to
# c(1, 0) and would break on a data frame with no rows.
dt$indx <- ave(seq_len(nrow(dt)), dt$idno_simple, FUN = seq_along)
mt$indx <- ave(seq_len(nrow(mt)), mt$idno_simple, FUN = seq_along)

# Join on the id plus the within-person index, so the k-th dt row of a person
# is matched only with that person's k-th mt row. This pairing is positional:
# it assumes both frames list a person's observations in the same order.
merge(dt, mt, by = c("idno_simple", "indx"))