我正在尝试从一个较大的数据帧中提取出一个较小的数据帧,只保留“row.ID”列的值在两个数据帧中都出现(即相互匹配)的那些行。
我一直在尝试使用 match()、subset() 和 merge(),但从未得到我需要的结果。
我的数据帧如下所示:
file1<- structure(list(row.ID = c(1, 22, 51, 31, 231, 21, 551, 13, 10,
11, 12, 83, 84, 86, 87, 89, 120, 91, 92, 311, 94, 187, 98), Col0 = c(1,
2, 3, 4, 5, 6, 12, 13, 15, 16, 18, 126, 128, 131, 132, 135, 136,
137, 139, 140, 141, 143, 148), V1 = c(238.27, 294.39, 413.3,
853.24, 7.06, 8.987, 41.73, 39.232, 11.151, 13.472, 8.041, 8.057,
7.961, 7.43, 8.047, 334.54, 229.03, 265.36, 354.49, 151.25, 237.75,
901.24, 280.27), V2 = c(7.686, 6.846, 10.08, 6.666, 26.741, 29.358,
12.885, 11.982, 22.898, 12.75, 21.041, 28.87, 12.316, 19.778,
71.023, 11.151, 13.472, 8.041, 8.057, 7.961, 7.43, 8.047, 9.342
), V3 = c(19.063, 25.17, 29.626, 79.233, 38.952, 42.658, 13.015,
12.244, 30.044, 17.862, 33.345, 44.065, 18.713, 31.822, 113.207,
22.898, 12.75, 21.041, 28.87, 12.316, 19.778, 71.023, 21.963),
V4 = c(31.814, 43.349, 42.989, 125.904, 28.853, 30.392, 16.483,
16.335, 25.648, 13, 22.347, 30.699, 13.699, 21.409, 75.841,
30.044, 17.862, 33.345, 44.065, 18.713, 31.822, 113.207,
30.905), V5 = c(19.398, 26.443, 29.687, 85.433, 43.737, 46.906,
12.413, 12.409, 32.337, 18.715, 36.953, 49.575, 21.079, 35.973,
124.988, 25.648, 13, 22.347, 30.699, 13.699, 21.409, 75.841,
21.904), V6 = c(35.325, 48.986, 45.76, 334.54, 0.75, 12,
241.34, 258.34, 282.4, 377.46, 30.392, 16.483, 0.648, 0.618,
0.634, 32.337, 18.715, 36.953, 49.575, 21.079, 35.973, 124.988,
33.416), V7 = c(0.615, 294.39, 413.3, 1.001, 1.051, 17, 1.011,
0.985, 0.974, 1.016, 46.906, 12.413, 377.46, 500.76, 470.78,
334.54, 0.75, 0.638, 0.656, 0.648, 0.618, 0.634, 0.732),
V8 = c(1.026, 1.008, 1.049, 10, 21, 12, 227.31, 241.34, 258.34,
282.4, 377.46, 500.76, 1.016, 1.085, 1.02, 1.001, 1.051,
1.01, 1.001, 0.985, 0.994, 1.011, 1.03), V9 = c(0.626, 46.906,
12.413, 12.409, 32.337, 18.715, 17, 0.678, 0.664, 0.656,
0.723, 0.721, 0.724, 1.374, 1.361, 0.855, 0.765, 0.677, 0.698,
0.721, 0.669, 0.677, 0.73), V10 = c(1.14, 377.46, 500.76,
470.78, 334.54, 0.75, 12, 241.34, 258.34, 282.4, 377.46,
30.392, 16.483, 16.335, 25.648, 13, 0.648, 0.618, 0.634,
32.337, 18.715, 36.953, 49.575), V11 = c(31, 1.016, 1.085,
1.02, 1.001, 1.051, 17, 1.011, 0.985, 0.974, 1.016, 46.906,
12.413, 12.409, 32.337, 18.715, 377.46, 500.76, 470.78, 334.54,
0.75, 0.638, 0.656), V12 = c(17, 32, 30, 12, 10, 21, 12,
227.31, 241.34, 258.34, 282.4, 377.46, 500.76, 470.78, 334.54,
0.75, 1.016, 1.085, 1.02, 1.001, 1.051, 1.01, 1.001), V13 = c(31,
43, 43, 132, 21, 0.99, 1, 1.016, 1.011, 0.985, 0.974, 1.016,
1.085, 1.02, 1.001, 1.051, 0.724, 1.374, 1.361, 0.855, 0.765,
0.677, 0.698), V14 = c(17, 21, 31, 0.985, 0.974, 1.016, 1.085,
9, 16, 17, 23, 32, 30, 12, 10, 21, 16.483, 16.335, 25.648,
13, 1.101, 1.12, 1.127), V15 = c(9, 9, 25, 17, 23, 32, 30,
0, 8, 8, 21, 6, 21, 6, 6, 7, 12.413, 12.409, 32.337, 18.715,
17, 33, 44)), .Names = c("row.ID", "Col0", "V1", "V2", "V3",
"V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13",
"V14", "V15"), row.names = c(NA, 23L), class = "data.frame")
file2<- structure(list(row.ID = c(83, 94, 98), X = c(1077, 1171, 1205
), V1 = c(-1.278147106, -0.895961572, -1.491168551), NO.YES = structure(c(1L,
1L, 1L), .Label = "NO", class = "factor"), YES.NO = structure(c(1L,
1L, 1L), .Label = "YES", class = "factor"), P.Y = c(0.168275205,
0.264104166, 0.128155717), P.NO = c(0.831724795, 0.735895834,
0.871844283)), .Names = c("row.ID", "X", "V1", "NO.YES", "YES.NO",
"P.Y", "P.NO"), row.names = c(NA, 3L), class = "data.frame")
我想要的输出是具有以下结构的数据帧。
row ID Col0 V1 V2 . . V15
83 126 8.057 28.87 . . 6
94 141 237.75 7.43 . . 17
98 148 280.27 9.342 . . 44
我主要尝试使用merge(),类似
mylist <- merge(file1,file2,by="row.ID")
但它造成了很多麻烦。有没有简单的命令可以做到这一点?
答案 0 :(得分:1)
第一段代码(k1)返回 file1 中 row.ID 出现在 file2 的 row.ID 里的那些行。因此,与使用 merge 命令不同,它不会为您提供 file2 的列。您可以按如下方式用类似的代码生成 k2,即 file2 中 row.ID 与 file1 匹配的那些行。
k1<-file1[file1$row.ID %in% file2$row.ID,]
> k1
row.ID Col0 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
12 83 126 8.057 28.870 44.065 30.699 49.575 16.483 12.413 500.760 0.721 30.392 46.906 377.460 1.016 32.000 6
21 94 141 237.750 7.430 19.778 31.822 21.409 35.973 0.618 0.994 0.669 18.715 0.750 1.051 0.765 1.101 17
23 98 148 280.270 9.342 21.963 30.905 21.904 33.416 0.732 1.030 0.730 49.575 0.656 1.001 0.698 1.127 44
k2<-file2[file2$row.ID %in% file1$row.ID,]
> k2
row.ID X V1 NO.YES YES.NO P.Y P.NO
1 83 1077 -1.2781471 NO YES 0.1682752 0.8317248
2 94 1171 -0.8959616 NO YES 0.2641042 0.7358958
3 98 1205 -1.4911686 NO YES 0.1281557 0.8718443
注意:如果您使用 merge,由于两个文件中都有 V1 列,结果中会生成 V1.x 和 V1.y 两列。
> merge(file1,file2,"row.ID")
row.ID Col0 V1.x V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 X V1.y NO.YES
1 83 126 8.057 28.870 44.065 30.699 49.575 16.483 12.413 500.760 0.721 30.392 46.906 377.460 1.016 32.000 6 1077 -1.2781471 NO
2 94 141 237.750 7.430 19.778 31.822 21.409 35.973 0.618 0.994 0.669 18.715 0.750 1.051 0.765 1.101 17 1171 -0.8959616 NO
3 98 148 280.270 9.342 21.963 30.905 21.904 33.416 0.732 1.030 0.730 49.575 0.656 1.001 0.698 1.127 44 1205 -1.4911686 NO
YES.NO P.Y P.NO
1 YES 0.1682752 0.8317248
2 YES 0.2641042 0.7358958
3 YES 0.1281557 0.8718443
答案 1 :(得分:1)
这是用 data.table 实现的 @James Pringle 的回答(见上面的评论;需要先用 library(data.table) 加载该包):
file1dt <- data.table(file1)
file2dt <- data.table(file2)
setkey(file1dt,row.ID)
file1dt[J(file2dt$row.ID)]
如果您想要两个文件中的列,只需执行 file1dt[file2dt]。OP 表示“我的数据将与其他数据文件一次又一次地进行比较”,因此 data.table(其合并/连接速度应该很快)可能会有所帮助。适用于 data.frame 的函数同样适用于 data.table。