在R

时间:2017-10-12 00:37:49

标签: r dataframe merge

我有一个不同长度的数据框列表(df),按年份编制索引,以便数据的代理如下:

df

  $df1          
         X..i..
  1999     10
  1998     13 
  1997     14

 $df2
         X..i..
 1999      20
 1998      11

 $df3
         X..i..
 1999      17
 1998       8
 1997       9
 1996      19

我想使用并保留索引/ rownames将这些数据帧组合到单个数据框

那样:

df_all
 Index     df1   df2   df3
  1999      10    20    17
  1998      13    11     8
  1997      14    n/a    9
  1996      n/a   n/a   19

编辑:

smalldflist <- lapply(bai_df, function(i) head(i, 10))
dput(smalldflist)

产生以下输出:

结构(列表(IN_DonaldsonWoods_QUAL.txt =结构(列表(X..i .. = c)(4.5528243479162, 32.6474339976978,52.7116018957456,170.932582874866,227.0430440174, 191.462399206825,226.94053541991,274.854835798233,336.457600434571, 409.132933511232)),。Name =“X..i ..”,row.names = c(“1725”, “1726”,“1727”,“1728”,“1729”,“1730”,“1731”,“1732”,“1733”, “1734”),class =“data.frame”),IN_DonaldsonWoods_QURU.txt = structure(list(list)     X..i .. = c(4.33729067152776,5.72878688080428,13.0247658962315,     22.0205798005054,25.9885943197615,18.9273551074104,43.5197887382031,     58.2775710248884,72.9225976242458,73.0466756114972)),。Name =“X..i ..”,row.names = c(“1827”, “1828”,“1829”,“1830”,“1831”,“1832”,“1833”,“1834”,“1835”, “1836”),class =“data.frame”),IN_DonaldsonWoods_QUVE.txt = structure(list(     X..i .. = c(7.87253273859391,18.9481296742303,42.5055176960097,     62.9980951594496,88.906442207264,74.2523230533691,106.911242713809,     152.445167763284,192.399603839633,221.263660216113)),。Name =“X..i ..”,row.names = c(“1731”, “1732”,“1733”,“1734”,“1735”,“1736”,“1737”,“1738”,“1739”, “1740”),class =“data.frame”),IN_LillyDickey_QUAL.txt = structure(list(     X..i .. = c(8.29576810088555,17.2934968058816,31.2091720401804,     33.8966066349882,47.6496887415004,32.9921546763907,82.2281435044324,     108.068226885475,103.894002151431,110.255812097949)),. Name =“X..i ..”,row.names = c(“1863”, “1864”,“1865”,“1866”,“1867”,“1868”,“1869”,“1870”,“1871”, “1872”),class =“data.frame”),IN_LillyDickey_QUMO.txt = structure(list(     X..i .. = c(3.42413493048312,8.0847630303073,19.6833503197648,     13.791136218324,21.4638165402601,30.6707376168741,30.8789937938806,     26.8661212585221,24.0732956549621,29.7872997715364)),。Name =“X..i ..”,row.names = c(“1867”, “1868”,“1869”,“1870”,“1871”,“1872”,“1873”,“1874”,“1875”, “1876”),class =“data.frame”),IN_Pioneers_QUAL.txt = structure(list(     X..i .. = c(9.14340435634345,23.5108626053757,33.8507393822465,     46.1027716604662,57.5247983011993,50.5892015892391,92.2448163706925,     225.832932372368,278.367628044195,193.931508820174)),。Name =“X..i ..”,row.names = c(“1817”, “1818”,“1819”,“1820”,“1821”,“1822”,“1823”,“1824”,“1825”, “1826”),class =“data.frame”),IN_Pioneers_QURU.txt = structure(list(     X..i .. = c(122.443727611702,658.649900930018,830.471777578934,     843.357139228152,1725.6495913006,1244.38668477703,973.00892131628,     1294.7441782001,1717.18570086886,1676.63841798444)),。Name =“X..i ..”,row.names = c(“1861”, “1862”,“1863”,“1864”,“1865”,“1866”,“1867”,“1868”,“1869”, “1870”),class =“data.frame”),OH_JohnsonWoods_QUAL.txt = structure(list(     X..i .. = c(1.9113449704439,3.39794661412248,5.32688450342693,     6.41921626908008,11.0307601252838,13.0825342873437,14.843680070585,     16.885746353779,20.1011664347289,19.853294774361)),。Name =“X..i ..”,row.names = c(“1626”, “1627”,“1628”,“1629”,“1630”,“1631”,“1632”,“1633”,“1634”, “1635”),class =“data.frame”)),. Name = c(“IN_DonaldsonWoods_QUAL.txt”, “IN_DonaldsonWoods_QURU.txt”,“IN_DonaldsonWoods_QUVE.txt”,“IN_LillyDickey_QUAL.txt”, “IN_LillyDickey_QUMO.txt”,“IN_Pioneers_QUAL.txt”,“IN_Pioneers_QURU.txt”, “OH_JohnsonWoods_QUAL.txt”))

3 个答案:

答案 0 :(得分:1)

您可以使用Reducemerge多个数据框。设置all = TRUE,在没有匹配时添加NAs。注意df是您设置的数据框列表,by表示用于合并的列。因此,在您的数据框列表中,&#34;索引&#34;应该是每个数据框中年份列的名称。

Reduce(function(...) merge(..., by="Index", all=TRUE), df)

感谢@jazzuro提供样本数据,以下是基础R中使用Reduce的等效解决方案。在此示例中设置用于合并by="year"的列:

df1 <- data.frame(year = c(1999, 1998, 1997),
              value = c(10, 13, 14))

df2 <- data.frame(year = c(1999, 1998),
              value = c(20, 11))

df3 <- data.frame(year = c(1999, 1998, 1997, 1996),
              value = c(17, 8, 9, 19))
df <- list(df1=df1, df2=df2, df3=df3)
df_merge  <- Reduce(function(...) merge(..., by="year", all=TRUE), df)
colnames(df_merge) <- c("Index", names(df))

#   Index df1 df2 df3
# 1 1996  NA  NA  19
# 2 1997  14  NA   9
# 3 1998  13  11   8
# 4 1999  10  20  17

答案 1 :(得分:1)

如果您拥有仅在全球环境中需要的数据,则可以尝试以下操作。首先,您在所有数据框中收集独特年份并创建主数据框,其中仅包含唯一年份。然后,将所有数据框放在一个列表中,并将每个数据框与master合并。由于您在temp中拥有主数据框,因此将其删除。最后,绑定所有数据帧并将长格式更改为宽格式。

library(tidyverse)

# Create a data frame with all unique years

master <- data.frame(year = mget(ls()) %>%
                            sapply(`[`, 1) %>%
                            as_vector %>% 
                            unique)

# Merge each data frame with the master df
temp <- mget(ls()) %>%
        lapply(function(x){full_join(x, master, by = "year")})

# Remove the master df in the list
temp[["master"]] <- NULL

# Bind all dfs and make it wide.
bind_rows(temp, .id = "data") %>%
spread(key = data, value = value)

#  year df1 df2 df3
#1 1996  NA  NA  19
#2 1997  14  NA   9
#3 1998  13  11   8
#4 1999  10  20  17

DATA

df1 <- data.frame(year = c(1999, 1998, 1997),
                  value = c(10, 13, 14))

df2 <- data.frame(year = c(1999, 1998),
                  value = c(20, 11))

df3 <- data.frame(year = c(1999, 1998, 1997, 1996),
                  value = c(17, 8, 9, 19))

答案 2 :(得分:1)

重新考虑@Djork显示的链合并,但请确保创建名为 Index 的实际列等于rownames()。此外,根据 df#重命名X..1列,这也可以避免在合并期间出现重复列警告。 dfs 下面等同于发布的 smalldflist

dfs <- lapply(seq_along(dfs), function(i){
        dfs[[i]]$Index = rownames(dfs[[i]])        # CREATE INDEX
        colnames(dfs[[i]])[1] <- paste0("df", i)   # RENAME X..1 COLUMN

        return(dfs[[i]])
})

dfs[[1]]
#             df1 Index
# 1725   4.552824  1725
# 1726  32.647434  1726
# 1727  52.711602  1727
# 1728 170.932583  1728
# 1729 227.043044  1729
# 1730 191.462399  1730
# 1731 226.940535  1731
# 1732 274.854836  1732
# 1733 336.457600  1733
# 1734 409.132934  1734

finaldf <- Reduce(function(...) merge(..., by="Index", all=TRUE), dfs)

finaldf
#    Index    df1     df2     df3     df4    df5    df6   df7        df8
# 1   1626     NA      NA      NA      NA     NA     NA    NA   1.911345
# 2   1627     NA      NA      NA      NA     NA     NA    NA   3.397947
# 3   1628     NA      NA      NA      NA     NA     NA    NA   5.326885
# 4   1629     NA      NA      NA      NA     NA     NA    NA   6.419216
# 5   1630     NA      NA      NA      NA     NA     NA    NA  11.030760
# ...