合并具有不同长度但具有共同列名的数据帧

时间:2018-11-30 01:49:57

标签: r dataframe merge plyr data-science

样本数据

df2 = structure(list(Country = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 25L, 42L, 43L, 44L, 45L,
46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L,
59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L,
72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L,
85L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L,
98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 107L, 108L,
109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L, 119L,
120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L, 130L,
131L, 132L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L,
142L, 143L, 144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L,
153L, 154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 163L,
164L, 165L, 166L), .Label = c(" Afghanistan", " Albania", " Algeria",
" Angola", " Argentina", " Armenia", " Australia", " Austria",
" Azerbaijan", " Bahamas", " Bangladesh", " Barbados", " Belarus",
" Belgium", " Belize", " Benin", " Bhutan", " Bolivia (Plurinational State of)",
" Bosnia and Herzegovina", " Botswana", " Brazil", " Bulgaria",
" Burkina Faso", " Burundi", " C\xaate d'Ivoire", " Cabo Verde",
" Cambodia", " Cameroon", " Canada", " Central African Republic",
" Chad", " Chile", " China", " Colombia", " Comoros", " Congo",
" Congo (Democratic Republic of the)", " Costa Rica", " Croatia",
" Cyprus", " Czechia", " Denmark", " Djibouti", " Dominican Republic",
" Ecuador", " Egypt", " El Salvador", " Estonia", " Eswatini (Kingdom of)",
" Ethiopia", " Fiji", " Finland", " France", " Gabon", " Gambia",
" Georgia", " Germany", " Ghana", " Greece", " Guatemala", " Guinea",
" Guinea-Bissau", " Guyana", " Haiti", " Honduras", " Hong Kong, China (SAR)",
" Hungary", " Iceland", " India", " Indonesia", " Iran (Islamic Republic of)",
" Iraq", " Ireland", " Israel", " Italy", " Jamaica", " Japan",
" Jordan", " Kazakhstan", " Kenya", " Kiribati", " Korea (Republic of)",
" Kyrgyzstan", " Lao People's Democratic Republic", " Latvia",
" Lebanon", " Lesotho", " Liberia", " Lithuania", " Luxembourg",
" Madagascar", " Malawi", " Malaysia", " Maldives", " Mali",
" Malta", " Mauritania", " Mauritius", " Mexico", " Micronesia (Federated States of)",
" Moldova (Republic of)", " Mongolia", " Montenegro", " Morocco",
" Mozambique", " Myanmar", " Namibia", " Nepal", " Netherlands",
" New Zealand", " Nicaragua", " Niger", " Nigeria", " Norway",
" Pakistan", " Palau", " Palestine, State of", " Panama", " Paraguay",
" Peru", " Philippines", " Poland", " Portugal", " Romania",
" Russian Federation", " Rwanda", " Saint Lucia", " Sao Tome and Principe",
" Senegal", " Serbia", " Sierra Leone", " Singapore", " Slovakia",
" Slovenia", " Solomon Islands", " South Africa", " South Sudan",
" Spain", " Sri Lanka", " Sudan", " Suriname", " Sweden", " Switzerland",
" Syrian Arab Republic", " Tajikistan", " Tanzania (United Republic of)",
" Thailand", " The former Yugoslav Republic of Macedonia", " Timor-Leste",
" Togo", " Trinidad and Tobago", " Tunisia", " Turkey", " Turkmenistan",
" Uganda", " Ukraine", " United Kingdom", " United States", " Uruguay",
" Uzbekistan", " Vanuatu", " Venezuela (Bolivarian Republic of)",
" Viet Nam", " Yemen", " Zambia", " Zimbabwe"), class = "factor"),
    ineq.HDI = c(0.463, 0.741, 0.729, 0.52, 0.813, 0.728, 0.923,
    0.895, 0.74, 0.789, 0.545, 0.782, 0.792, 0.903, 0.699, 0.473,
    0.566, 0.649, 0.713, 0.66, 0.727, 0.779, 0.375, 0.395, 0.629,
    0.537, 0.506, 0.902, 0.351, 0.371, 0.808, 0.706, 0.719, 0.482,
    0.557, 0.407, 0.754, 0.808, 0.85, 0.862, 0.442, 0.91, 0.449,
    0.703, 0.715, 0.665, 0.671, 0.845, 0.538, 0.412, 0.711, 0.903,
    0.882, 0.665, 0.441, 0.735, 0.921, 0.554, 0.856, 0.611, 0.404,
    0.426, 0.63, 0.47, 0.596, 0.901, 0.823, 0.891, 0.581, 0.661,
    0.755, 0.649, 0.909, 0.887, 0.87, 0.712, 0.885, 0.728, 0.765,
    0.543, 0.59, 0.884, 0.636, 0.546, 0.816, 0.758, 0.493, 0.407,
    0.824, 0.889, 0.504, 0.441, 0.772, 0.671, 0.403, 0.843, 0.487,
    0.749, 0.743, 0.608, 0.67, 0.697, 0.793, 0.616, 0.403, 0.53,
    0.594, 0.529, 0.91, 0.899, 0.621, 0.318, 0.484, 0.942, 0.526,
    0.769, 0.672, 0.758, 0.675, 0.717, 0.665, 0.835, 0.822, 0.797,
    0.78, 0.485, 0.731, 0.542, 0.456, 0.759, 0.385, 0.909, 0.829,
    0.882, 0.507, 0.649, 0.413, 0.865, 0.745, 0.47, 0.703, 0.905,
    0.932, 0.644, 0.634, 0.493, 0.724, 0.735, 0.619, 0.456, 0.775,
    0.716, 0.734, 0.673, 0.486, 0.733, 0.905, 0.914, 0.773, 0.666,
    0.591, 0.759, 0.654, 0.498, 0.544, 0.467), ineq.income. = c(NA,
    14.4, NA, 36.4, 34.4, 10.8, 16.6, 15.1, 4.4, 24.5, 14.8,
    16.1, 11.1, 15.1, 48.5, 19.2, NA, 54.2, 31.1, NA, 37.6, 16.1,
    25.3, 24.9, NA, 21.4, 19.9, 17.1, 28.1, 20.8, 34.1, 29.5,
    43.6, 54, 22, 22.1, 33.7, 27.8, 13.8, 12.2, 20.5, 11.3, 21.3,
    32.6, 23.4, 15.9, 32.7, 17.7, 32.3, 20.8, NA, 13.4, 13.9,
    22.1, 33.3, 25.9, 16.7, 25.4, 19.9, 42.5, 26.8, 32.5, 20.3,
    47.9, 39.7, NA, 14.7, 13.4, 14.7, 14.8, NA, NA, 18.8, 23.7,
    19.4, 24.1, NA, 18.7, 17.6, 28.8, NA, 18.4, 12.2, 15.5, 20.5,
    NA, 39.5, 19, 20.6, 15.7, 19.3, 19.7, 28.7, 19.5, 25.4, NA,
    21.5, NA, 31.6, 63.1, 19.4, 16.4, 12.6, 20.7, 58.1, NA, 68.3,
    26.4, 15.3, NA, 23.8, 21.1, 25.1, 13.1, 10.6, NA, NA, 52.6,
    33.4, 42.7, 28, 18.4, 23.9, 12.2, 11.9, 31.5, NA, NA, 21.1,
    12.2, 22.2, NA, 11.7, 12.2, NA, 40.9, NA, 18.5, 20.8, NA,
    34.9, 13, 14.3, 18.3, 15.3, 17.6, 34, 21.8, 19.2, 20, 21.9,
    21.8, 26.5, 38.7, 26.4, 10.4, 21, 23.5, 26.3, 17.9, NA, 32,
    18.2, 17.6, 20.8, 34.5), ineq.edu = c(39.3, 12.7, NA, 26.2,
    12.1, 6.5, 1.7, 2.4, 12, 7.9, 44.8, NA, 8, 5.2, 19.8, 44.1,
    NA, 28.7, 19.4, NA, 25.7, 8.1, 37.3, 36.3, 30.7, 31.1, 35.3,
    3.2, 45.9, 37.8, 13.3, 23.2, 23.9, 47.4, 30, 29.1, 17.7,
    NA, 15.7, 1.3, 44.8, 3, 47, 22.2, 21.8, 43.6, 29.2, 3.1,
    38.3, 38.2, 11, 4.7, 9.1, 7.3, 44.7, 4.9, 2.3, 7.5, 5.8,
    36.1, 42.6, 40.3, 9.6, 40.7, 31, NA, 4.1, 2.6, 40.6, 21.4,
    NA, NA, 3.2, 7.9, 11.8, 8.3, NA, 25.1, 5.3, 29.2, NA, NA,
    11.1, 30.5, 3.3, NA, 24.9, 46.4, 4.3, 6.2, 30.8, 34.7, NA,
    11.5, 36.9, NA, 43.2, NA, 17.9, 22.4, 7.5, 5.8, 9.6, 42.7,
    28.2, NA, 27.8, 43.3, 4, NA, 33.3, 31.3, 46, 2.4, 46.4, NA,
    NA, NA, 19.8, 30.2, 12.9, 7.1, NA, 13.1, 11.2, 30.7, NA,
    NA, 47.3, 11.1, 48.2, NA, 1.7, 4, NA, 20.8, NA, 5.7, 17.9,
    NA, 20.1, 3.6, 2, 31.5, 9.4, 28.7, 18, 17.5, 44.3, 5.1, 6.6,
    38.7, 27.4, NA, 28.2, 2.8, 2.1, 3.2, 10.8, 1.4, NA, 17, 17.1,
    49.8, 24.2, 20.1), ineq.adj.income = c(NA, 0.593, NA, 0.384,
    0.515, 0.573, 0.755, 0.78, 0.726, 0.616, 0.421, 0.638, 0.681,
    0.776, 0.331, 0.353, NA, 0.273, 0.478, NA, 0.467, 0.635,
    0.297, 0.23, NA, 0.378, 0.407, 0.75, 0.237, 0.35, 0.517,
    0.485, 0.396, 0.184, 0.433, 0.217, 0.484, 0.576, 0.755, 0.739,
    0.391, 0.818, 0.388, 0.476, 0.523, 0.581, 0.434, 0.668, 0.433,
    0.284, NA, 0.785, 0.771, 0.58, 0.279, 0.468, 0.758, 0.384,
    0.682, 0.363, 0.297, 0.269, 0.491, 0.213, 0.331, NA, 0.69,
    0.757, 0.486, 0.567, NA, NA, 0.726, 0.654, 0.717, 0.498,
    NA, 0.558, 0.646, 0.345, NA, 0.705, 0.432, 0.47, 0.627, NA,
    0.31, 0.223, 0.639, 0.819, 0.319, 0.281, 0.573, 0.569, 0.326,
    NA, 0.414, NA, 0.523, 0.197, 0.456, 0.538, 0.652, 0.497,
    0.138, NA, 0.211, 0.333, 0.784, NA, 0.425, 0.249, 0.439,
    0.846, 0.512, NA, NA, 0.355, 0.424, 0.392, 0.458, 0.659,
    0.641, 0.683, 0.72, 0.27, NA, NA, 0.366, 0.639, 0.293, NA,
    0.734, 0.749, NA, 0.425, NA, 0.71, 0.531, NA, 0.485, 0.801,
    0.826, 0.507, 0.421, 0.377, 0.485, 0.557, 0.535, 0.284, 0.672,
    0.544, 0.575, 0.418, 0.3, 0.588, 0.705, 0.719, 0.568, 0.468,
    NA, 0.524, 0.463, 0.466, 0.409, 0.251)), class = "data.frame", row.names = c(NA, -166L))

df1 = structure(list(Country = structure(1:97, .Label = c("Afghanistan",
"Albania", "Armenia", "Australia", "Austria", "Bahamas", "Barbados",
"Belarus", "Belgium", "Belize", "Bermuda", "Bolivia (Plurinational State of)",
"Bosnia and Herzegovina", "Brazil", "Brunei Darussalam", "Bulgaria",
"Cambodia", "Canada", "Chile", "Colombia", "Costa Rica", "Croatia",
"Cyprus", "Czechia", "Denmark", "Dominican Republic", "Ecuador",
"Egypt", "El Salvador", "Estonia", "Finland", "France", "Georgia",
"Germany", "Greece", "Grenada", "Guatemala", "Honduras", "Hong Kong SAR",
"Hungary", "Iceland", "Indonesia", "Iran (Islamic Republic of)",
"Ireland", "Israel", "Italy", "Jamaica", "Japan", "Kuwait", "Lao People's Democratic Republic",
"Latvia", "Lithuania", "Luxembourg", "Malaysia", "Maldives",
"Malta", "Mexico", "Moldova (Republic of)", "Montenegro", "Myanmar",
"Netherlands", "New Zealand", "Nicaragua", "Nigeria", "Norway",
"Pakistan", "Panama", "Paraguay", "Peru", "Philippines", "Poland",
"Portugal", "Puerto Rico", "Romania", "Russian Federation", "Saudi Arabia",
"Serbia", "Slovakia", "Slovenia", "South Africa", "South Korea",
"Spain", "Suriname", "Sweden", "Switzerland", "Taiwan", "Thailand",
"The former Yugoslav Republic of Macedonia", "Trinidad and Tobago",
"Turkey", "Ukraine", "United Kingdom (Scotland)", "United States of America",
"Uruguay", "Venezuela (Bolivarian Republic of)", "Viet Nam",
"Zambia"), class = "factor"), Amph.Average = c(0.5, 0.1, 0.04,
2.1, 0.5, 0.43, 0.2, 0.35, 0.9, 1.35, 0.1, 0.5, 0.5, 0.3, 0.25,
0.65, 0.6, 0.5, 0.27, 0.06, 0.05, 0.7, 0.3, 0.9, 0.7, 0.95, 0.2,
0.48, 0.38, 1, 0.8, 0.2, 0.56, 0.7, 0.05, 0.7, 0.91, 0.84, 0.35,
0.5, 0.93, 0.18, 0.7, 0.4, 1.01, 0.32, 0.81, 0.3, 0.27, 1.39,
0.3, 0.2, 0.73, 0.55, 0.1, 0.93, 0.12, 0.03, 0.72, 0.22, 0.4,
1.6, 0.76, 1.4, 0.7, 0.08, 1.2, 0.51, 0.2, 1.1, 1.9, 0.05, 0.43,
0.03, 0.39, 0.4, 0.1, 0.12, 0.72, 1.02, 0.12, 0.6, 0.5, 0.8,
0.63, 0.6, 1.4, 0.33, 0.7, 0.1, 0.72, 1.2, 1.8, 0.01, 0.47, 0.22,
0.1)), class = "data.frame", row.names = c(NA, -97L))

合并尝试= Merged frame

你好!我正在尝试合并两个长度不同的数据帧,但要合并一个匹配的列,其中一个可以被认为是较大的一个子集。我的不平等数据框架包含166个国家(行),另一个数据框架包含97个国家(行)。我正在尝试将这两者合并为一个数据框,将NA留在不匹配的69个国家/地区。

我已经尝试过testmerge = merge.data.frame(df2, df1, by.x = "Country", by.y = "Country", all.x = "TRUE")及其许多变体,但是这会在形成Amph.Average NA的所有行的同时输出较大数据帧中的所有数据。

如何将这两个数据帧合并在一起,在一个数据帧中将较大数据帧的所有列和较小数据帧的Amph.Average列合并在一起,而在其中NA没有比赛吗?

我还尝试了dplyr::left_join(df2, df1)plyr::join(df2, df1, type="full),它们给出的输出基本上相同。

1 个答案:

答案 0 :(得分:0)

问题在于df2$Country中附加的前导空格(df1$Country中不存在)。

我们可以在合并之前使用trimws修剪空白:

merge(df1, transform(df2, Country = trimws(Country)), by = "Country", all.y = T)
#                                      Country Amph.Average ineq.HDI
#1                                 Afghanistan         0.50    0.463
#2                                     Albania         0.10    0.741
#3                                     Armenia         0.04    0.728
#4                                   Australia         2.10    0.923
#5                                     Austria         0.50    0.895
#6                                     Bahamas         0.43    0.789
#7                                    Barbados         0.20    0.782
#8                                     Belarus         0.35    0.792
#9                                     Belgium         0.90    0.903
#10                                     Belize         1.35    0.699
#...

或者在tidyverse

library(tidyverse)
full_join(df2, df1 %>% mutate(Country = trimws(Country)))