我有两张关于人的数据表:
# People table: one row per person, identified by id.
df1 <- data.frame(
  id = c(113, 202, 377, 288, 359),
  name = c("Alex", "Silvia", "Peter", "Jack", "Jonny")
)
这为我提供了
id name
1 113 Alex
2 202 Silvia
3 377 Peter
4 288 Jack
5 359 Jonny
我有第二张表,其中包含其家庭成员的姓名:
# Family table: one row per family member; id refers to the person in df1.
df2 <- data.frame(
  id = c(113, 113, 113, 202, 202, 359, 359, 359, 359),
  family.members = c(
    "Ross", "Jefferson", "Max", "Jo", "Michael",
    "Jimmy", "Rex", "Bill", "Larry"
  )
)
这为我提供了:
> df2
id family.members
1 113 Ross
2 113 Jefferson
3 113 Max
4 202 Jo
5 202 Michael
6 359 Jimmy
7 359 Rex
8 359 Bill
9 359 Larry
现在我想扩展表1,增加一列,其中包含每个人的家庭成员数量:
id name no.family.memebers
1 113 Alex 3
2 202 Silvia 2
3 377 Peter 0
4 288 Jack 0
5 359 Jonny 4
在R中创建第三个表的最佳方法是什么?
非常感谢你!
答案 0 :(得分:8)
使用dplyr
library(dplyr)
# Count the rows of df2 per id, then attach those counts to df1.
# The join key is left implicit, as before (the shared column is id).
df1 <- left_join(
  df1,
  summarize(group_by(df2, id), no.family.members = n())
)
对于dplyr >= 0.3.0.2,可以将其重写为:
# count(df2, id) yields one row per id with the row count in column n.
df3 <- left_join(df1, count(df2, id))
答案 1 :(得分:5)
# Sort by id so the printed result keeps the same row order as before.
df1 <- df1[order(df1$id), ]
# One count per distinct id in df2; names(counts) holds the ids as strings.
counts <- with(df2, tapply(family.members, id, length))
# Look every person up by id instead of assigning the counts positionally:
# ids that occur only in df2 are ignored, and ids without any family member
# get NA, regardless of how the two tables line up.
df1$no.family.members <-
  as.vector(counts)[match(as.character(df1$id), names(counts))]
df1
id name no.family.members
1 113 Alex 3
2 202 Silvia 2
4 288 Jack NA
5 359 Jonny 4
3 377 Peter NA
(我认为NA比0更具信息性。)
答案 2 :(得分:2)
我建议使用data.table代替data.frame(要将data.frame转换为data.table,只需执行data.table(my_data_frame)):
require(data.table)
# Key both tables by id so that df2[df1] joins on that column.
df1 <- data.table(df1, key = "id")
df2 <- data.table(df2, key = "id")
# Join every person to their family rows, then, per id, keep the name and
# count the non-missing family members.
rslt <- df2[df1, allow.cartesian = TRUE][
  ,
  list(
    name = unique(name),
    no.family.members = length(na.omit(family.members))
  ),
  by = id
]
#rslt
# id name no.family.members
#1: 113 Alex 3
#2: 202 Silvia 2
#3: 288 Jack 0
#4: 359 Jonny 4
#5: 377 Peter 0
:
{{1}}
答案 3 :(得分:2)
这是另一个data.table
版本
library(data.table)
# Convert df2 to a data.table and key it by id.
setDT(df2)
setkey(df2, id)
# Count rows per id, then join those counts to df1.
df2[, list(no.family.memebers = .N), by = id][df1]
# id no.family.memebers name
# 1: 113 3 Alex
# 2: 202 2 Silvia
# 3: 288 NA Jack
# 4: 359 4 Jonny
# 5: 377 NA Peter
或者对于v 1.9.4+,使用.EACHI
(由@Arun提供)
# Convert df2 to a data.table and key it by id.
setDT(df2)
setkey(df2, id)
# Join on df1 and evaluate j once per row of df1 (by = .EACHI).
df2[df1, list(no.family.memebers = .N, name), by = .EACHI]
# id no.family.memebers name
# 1: 113 3 Alex
# 2: 202 2 Silvia
# 3: 377 0 Peter
# 4: 288 0 Jack
# 5: 359 4 Jonny
在更大的数据集上补充一些基准测试(本答案中的data.table
解决方案遥遥领先,其中使用.EACHI
的实现效率最高)
library(dplyr)
library(data.table)
library(microbenchmark)
# Benchmark input: 26 people (ids 1..26, named A..Z).
df1 <- data.frame(id = seq_len(26), name = LETTERS)

# One million family-member rows with randomly drawn ids and names.
# The seed is fixed so the data is reproducible; ids are sampled before
# the names.
set.seed(123)
n <- 1e6
df2 <- data.frame(
  id = sample(seq_len(26), n, replace = TRUE),
  family.members = sample(letters, n, replace = TRUE)
)
# Separate copies of the inputs; the data.table-based functions are
# benchmarked on df1.1/df2.2 rather than on df1/df2.
# NOTE(review): presumably because those functions call setDT()/setkey() on
# their arguments and would otherwise alter the data frames used by the
# dplyr/base functions -- confirm.
df1.1 <- copy(df1)
df2.2 <- copy(df2)
#' Count family members per person with dplyr (group_by + summarize).
#'
#' @param df1 Table of people with an `id` column.
#' @param df2 Table with one row per family member and an `id` column.
#' @return `df1` left-joined with a `no.family.members` column holding the
#'   number of `df2` rows per id.
Gregordplyr <- function(df1, df2) {
  per_id <- summarize(group_by(df2, id), no.family.members = n())
  # The join key is left implicit, exactly as in the piped version.
  left_join(df1, per_id)
}
#' Count family members per person with dplyr's count().
#'
#' @param df1 Table of people with an `id` column.
#' @param df2 Table with one row per family member and an `id` column.
#' @return `df1` left-joined with the per-id row counts of `df2`.
begineRdplyr <- function(df1, df2) {
  id_counts <- count(df2, id)
  left_join(df1, id_counts)
}
#' Count family members per person using base R only.
#'
#' @param df1 Data frame of people with an `id` column.
#' @param df2 Data frame with one row per family member, with columns `id`
#'   and `family.members`.
#' @return `df1` sorted by `id`, with a `no.family.members` column holding
#'   the number of `df2` rows for each id (`NA` when an id has no rows in
#'   `df2`).
BDbaseR <- function(df1, df2) {
  # Keep the established output order: rows sorted by id.
  df1 <- df1[order(df1$id), ]
  # One count per distinct id in df2; names(counts) holds the ids as strings.
  counts <- with(df2, tapply(family.members, id, length))
  # Look every person up by id instead of assigning the counts positionally.
  # Positional assignment misaligns (and recycles) the counts as soon as df2
  # contains an id that is not in df1, or df1 repeats an id.
  df1$no.family.members <-
    as.vector(counts)[match(as.character(df1$id), names(counts))]
  df1
}
#' Count family members per person with a keyed data.table join.
#'
#' @param df1 Table of people with `id` and `name` columns.
#' @param df2 Table with `id` and `family.members` columns.
#' @return A data.table with one row per id: `id`, `name` and
#'   `no.family.members` (number of non-missing family members).
AlexDT <- function(df1, df2) {
  people <- data.table(df1, key = "id")
  relatives <- data.table(df2, key = "id")
  # Every person is joined to their family rows.
  joined <- relatives[people, allow.cartesian = TRUE]
  joined[
    ,
    list(
      name = unique(name),
      no.family.members = length(na.omit(family.members))
    ),
    by = id
  ]
}
#' Count rows of df2 per id with data.table, then join the counts to df1.
#'
#' @param df1 Table of people with an `id` column.
#' @param df2 Table with one row per family member and an `id` column;
#'   it is passed to setDT() and setkey().
#' @return The per-id counts (`no.family.memebers`) joined to `df1`.
DavdDT <- function(df1, df2) {
  setDT(df2)
  setkey(df2, id)
  per_id <- df2[, list(no.family.memebers = .N), by = id]
  per_id[df1]
}
#' Variant of DavdDT that also converts and keys df1 before the join.
#'
#' @param df1 Table of people with an `id` column; it is passed to setDT()
#'   and setkey().
#' @param df2 Table with one row per family member and an `id` column;
#'   it is passed to setDT() and setkey().
#' @return The per-id counts (`no.family.memebers`) joined to `df1`.
DavdDTV2 <- function(df1, df2) {
  setDT(df2)
  setkey(df2, id)
  per_id <- df2[, list(no.family.memebers = .N), by = id]
  setDT(df1)
  setkey(df1, id)
  per_id[df1]
}
#' Count family members per person with a data.table join using by = .EACHI.
#'
#' @param df1 Table of people with `id` and `name` columns.
#' @param df2 Table with one row per family member and an `id` column;
#'   it is passed to setDT() and setkey().
#' @return A data.table with `id`, `no.family.memebers` and `name`.
ArunDT <- function(df1, df2) {
  setDT(df2)
  setkey(df2, id)
  # j is evaluated once per row of df1.
  df2[df1, list(no.family.memebers = .N, name), by = .EACHI]
}
#' Variant of ArunDT that also converts and keys df1 before the join.
#'
#' @param df1 Table of people with `id` and `name` columns; it is passed to
#'   setDT() and setkey().
#' @param df2 Table with one row per family member and an `id` column;
#'   it is passed to setDT() and setkey().
#' @return A data.table with `id`, `no.family.memebers` and `name`.
ArunDTV2 <- function(df1, df2) {
  setDT(df2)
  setkey(df2, id)
  setDT(df1)
  setkey(df1, id)
  # j is evaluated once per row of df1.
  df2[df1, list(no.family.memebers = .N, name), by = .EACHI]
}
# Time all eight approaches with microbenchmark. The dplyr and base R
# functions receive df1/df2; the data.table functions receive the copies
# df1.1/df2.2.
Res <- microbenchmark(Gregordplyr(df1, df2),
begineRdplyr(df1, df2),
BDbaseR(df1, df2),
AlexDT(df1.1, df2.2),
ArunDT(df1.1, df2.2),
ArunDTV2(df1.1, df2.2),
DavdDT(df1.1, df2.2),
DavdDTV2(df1.1, df2.2)
)
# Print the timing summary (recorded output follows).
Res
# Unit: milliseconds
# expr min lq mean median uq max neval
# Gregordplyr(df1, df2) 43.567614 46.486239 51.154432 47.943481 50.711707 93.40908 100
# begineRdplyr(df1, df2) 43.817494 46.105103 51.298581 47.878149 50.613609 125.07362 100
# BDbaseR(df1, df2) 88.098035 97.065111 121.290967 129.912539 137.914435 179.60281 100
# AlexDT(df1.1, df2.2) 55.004083 63.029861 88.840319 99.043231 104.272165 284.40967 100
# ArunDT(df1.1, df2.2) 4.608774 4.967607 6.621559 5.412694 6.584724 45.88562 100
# ArunDTV2(df1.1, df2.2) 4.870497 5.305124 6.381737 5.593097 6.429782 34.93075 100
# DavdDT(df1.1, df2.2) 8.578043 9.074449 11.943810 9.585854 10.693341 55.91518 100
# DavdDTV2(df1.1, df2.2) 8.822792 9.508088 11.467790 9.970544 11.009343 51.58866 100
# Plot the timing results.
boxplot(Res)