按重叠的 row.names 合并两个数据集并对各列取平均值的最有效方法

时间:2015-04-16 12:28:38

标签: r

我想找到最高效的方法来合并两个 row.names 不完全相同的数据框,并对各列的值取平均。具体来说,我只想保留两个数据框中 row.names 重叠的行,把它们合并成一个数据框,其中每一列的值是两个数据框中对应值的平均值。示例数据:

# First example data frame (dput-style literal): 32 rows keyed by car-model
# row names and 11 numeric columns (mpg, cyl, disp, hp, drat, wt, qsec, vs,
# am, gear, carb). Assigning to `mtcars` creates a variable with the same name
# as R's built-in dataset.
mtcars <- 
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
    disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
    167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
    71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
    301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95, 
    123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150, 
    150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9, 
    3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
    3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
    3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
    ), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
    3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
    1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
    1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61, 
    19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
    18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
    17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
    ), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 
    0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1, 
    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3, 
    3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
    3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4, 
    2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 
    2, 2, 4, 6, 8, 2)), .Names = c("mpg", "cyl", "disp", "hp", 
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4", 
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", 
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280", 
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", 
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", 
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", 
"Volvo 142E"), class = "data.frame")

第二个数据:

# Second example data frame (dput-style literal): 30 rows and the same 11
# numeric columns as the first one. Some row names differ from the first data
# frame (e.g. "Chrysler", "Nexia", "Pontiac Firebirda", "Punto",
# "Ford Pantera T"), so only part of the rows overlap by name.
mtcars11 <- 
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
19.7), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 
8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6), disp = c(160, 160, 
108, 258, 360, 225, 360, 146.7, 140.8, 167.6, 167.6, 275.8, 275.8, 
275.8, 472, 460, 440, 78.7, 75.7, 71.1, 120.1, 318, 304, 350, 
400, 79, 120.3, 95.1, 351, 145), hp = c(110, 110, 93, 110, 175, 
105, 245, 62, 95, 123, 123, 180, 180, 180, 205, 215, 230, 66, 
52, 65, 97, 150, 150, 245, 175, 66, 91, 113, 264, 175), drat = c(3.9, 
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 3.07, 
3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 3.15, 
3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62), wt = c(2.62, 2.875, 
2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44, 3.44, 4.07, 
3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 1.615, 1.835, 2.465, 3.52, 
3.435, 3.84, 3.845, 1.935, 2.14, 1.513, 3.17, 2.77), qsec = c(16.46, 
17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 
17.4, 17.6, 18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 
16.87, 17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5), vs = c(0, 
0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 
0, 0, 0, 1, 0, 1, 0, 0), am = c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1), 
    gear = c(4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 
    3, 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5), carb = c(4, 4, 
    1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 
    2, 2, 4, 2, 1, 2, 2, 4, 6)), .Names = c("mpg", "cyl", "disp", 
"hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4", 
"Chrysler", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", 
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280", 
"Merc 280C", "Merc 450SE", "Nexia", "Merc 450SLC", "Cadillac Fleetwood", 
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
"Camaro Z28", "Pontiac Firebirda", "Punto", "Porsche 914-2", 
"Lotus Europa", "Ford Pantera T", "Ferrari Dino"), class = "data.frame")

所以我想到的解决方案是(长篇):

# Row names of each data frame.
vec_names_mt <- row.names(mtcars)
vec_names_mt11 <- row.names(mtcars11)

# Row names present in both data frames.
vec_inter <- intersect(vec_names_mt, vec_names_mt11)

# Pick the shared rows by position with match(), so that both subsets list
# their rows in the same order (that of `vec_inter`) and can be combined
# element-wise afterwards, e.g. (data_mt + data_mt11) / 2. Filtering with
# %in% would keep each frame's own row order, which silently misaligns the
# rows whenever the two frames are sorted differently.
data_mt <- mtcars[match(vec_inter, vec_names_mt), , drop = FALSE]
data_mt11 <- mtcars11[match(vec_inter, vec_names_mt11), , drop = FALSE]

接下来,如何把这两个子集合并起来并对各列的值求平均?有没有最简单的做法?

2 个答案:

答案 0 :(得分:7)

假设 d1 和 d2 是您的两个 data.frame,我会这样处理。不过,您必须使用 data.table 的开发版本(v1.9.5),mget 才能在这里正常工作。

# Needs the development version of data.table (v1.9.5) so that mget() works
# inside `j`. library() is used rather than require() so that a missing
# package stops the script right here instead of failing later.
library(data.table)

# Convert both data.frames to data.tables by reference, keeping the row names
# as a column named "rn", and key on that column for the join.
setkey(setDT(d1, keep.rownames = TRUE), rn)
setkey(setDT(d2, keep.rownames = TRUE), rn)

# Value columns of d1 (everything except "rn"), and the names under which the
# matching d2 columns are referenced inside the join (prefixed with "i.").
xcols <- names(d1)[-1L]
icols <- paste0("i.", xcols)

# Mean of the matched values from both tables, ignoring NAs.
foo <- function(a, b) mean(c(a, b), na.rm = TRUE)

# Inner join (nomatch = 0L) of d1 and d2 on "rn"; for every row of d2
# (by = .EACHI) average each pair of corresponding columns.
d1[d2, Map(foo, mget(xcols), mget(icols)), by = .EACHI, nomatch = 0L]

我们首先使用 setDT 通过引用把 data.frame 转换为 data.table,同时把行名转换为一个新列(该列会被自动命名为 rn),并在该列上设置键(key)。

setkey() 会按指定的列对 data.table 重新排序,并把这些列标记为键列,这有助于我们(在这些键列上)执行连接。

在 data.table 中,既可以用 x[i] 语法,也可以用 merge() 函数(data.table 为它实现了相应的方法)来完成连接,但 x[i] 更强大、更灵活。x[i] 语法会把 i 的每一行连接到 x 中(在键列上)匹配的行。

因此,d1[d2]会为d2中的每一行返回d1中匹配的行以及d2中的所有其他列。

d1[d2, nomatch=0L]相当于内部联接,其中只返回匹配的行。

d1[d2, Map(foo, mget(xcols), mget(icols)), by=.EACHI, nomatch=0L] 会针对 d2 中的每一行分别计算 j = Map(...) 中的表达式——这正是 by = .EACHI 的作用。

总结一下:对于 d2 中的每一行,在 d1 中查找匹配的行;仅对匹配的行提取 xcols 和 icols 中指定的列,并应用函数 foo()——它把两个向量拼接起来再取 mean()。这一操作对 d2 的每一行分别执行(by = .EACHI)。d2 中那些键列在 d1 里没有任何匹配的行会被忽略(nomatch=0L)。

希望这有帮助。

答案 1 :(得分:6)

您似乎是想按行名对两个数据集做"内连接"(inner join)。我建议尝试用 data.table 包来完成合并,以及之后的 melt 和 dcast 操作。

首先,我会把 mtcars 复制为 mtcars2,因为 mtcars 是 R 内置的数据集:我既不想覆盖它,而且 setDT 实际上也无法覆盖内置的数据集。所以我们假设在实际场景中,您的数据叫做 mtcars2:

library(data.table)
mtcars2 <- copy(mtcars)

接下来,我们把它们转换为 data.table 对象,同时保留行名称,并设置键以加快连接:

setkey(setDT(mtcars2, keep.rownames = TRUE), rn)
setkey(setDT(mtcars11, keep.rownames = TRUE), rn)

现在,我们在 rn(键)上执行内连接,同时使用 suffixes = NULL 保留原始列名称:

Res <- merge(mtcars2, mtcars11, suffixes = NULL)
现在我们可以按 rn 进行 melt,然后 dcast,同时按唯一列计算平均值:

dcast(melt(Res, "rn"), rn ~ variable, mean.default)
#                      rn  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#  1:         AMC Javelin 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
#  2:  Cadillac Fleetwood 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
#  3:          Camaro Z28 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
#  4:   Chrysler Imperial 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
#  5:          Datsun 710 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
#  6:    Dodge Challenger 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
#  7:          Duster 360 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
#  8:        Ferrari Dino 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
#  9:            Fiat 128 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
# 10:         Honda Civic 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
# 11:      Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
# 12:   Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
# 13: Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
# 14:        Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
# 15:           Mazda RX4 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
# 16:            Merc 230 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
# 17:           Merc 240D 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
# 18:            Merc 280 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
# 19:           Merc 280C 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
# 20:          Merc 450SE 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
# 21:         Merc 450SLC 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
# 22:       Porsche 914-2 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
# 23:      Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
# 24:       Toyota Corona 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
# 25:             Valiant 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1

{{1}}