基于R中的列合并100个文件

时间:2014-10-25 17:09:34

标签: r

我有这样的数据帧(100):

df1
  v1 v2 v3
1  1  2  a
2  2  3  b
3  3  4  c
4  4  5  d
5  3  5  e

df2
  v1 v2 v3
1  1  2  j
2  2  3  i
3  3  4  t
4  3  5  r

df3
  v1 v2 v3
1  2  3  t
2  2  4  g
3  6  7  i
4  8  9  t

现在,我想根据第三列中的值合并它们并形成一个文件,这样,如果第三列中有相应的值,那么序列中将有一个值0。 因此,生成的文件将是

  v1 v2  v3
1  1  2 aj0
2  2  3 bit
3  2  4 00g
4  3  4 ct0
5  3  5 er0
6  4  5 d00
7  6  7 00i
8  8  9 00t

我能够为此提出一个代码,如下所示:

> df12 <- merge(df1, df2, by=c("v1", "v2"), all=TRUE)
> df123 <- merge(df12, df3, by=c("v1", "v2"), all=TRUE)
> df123$v3 <- paste0(df123[,3], df123[,4], df123[,5])
> df123$v3.x <- df123$v3.y <- NULL
> df123$v3 <- gsub("NA", "0", df123$v3)

但是,我有100个数据帧,我怎么能用它们呢?谢谢!

2 个答案:

答案 0 :(得分:4)

使用dplyr

的另一种方法
library(dplyr)
library(tidyr)

unnest(dfs,group) %>%
                  spread(group, v3, fill=0) %>% 
                    unite(v3, df1:df3, sep="")
 #  v1 v2  v3
 #1  1  2 aj0
 #2  2  3 bit
 #3  2  4 00g
 #4  3  4 ct0
 #5  3  5 er0
 #6  4  5 d00
 #7  6  7 00i
 #8  8  9 00t

或使用data.table

 library(data.table)
 dt <- rbindlist(Map(`cbind`,group=names(dfs),dfs))
 dcast.data.table(dt, v1+v2~group, value.var="v3", fill="0")[,
                list(v3=do.call(`paste0`, .SD)), by=list(v1,v2), .SDcols=names(dfs)]

 #   v1 v2  v3
 #1:  1  2 aj0
 #2:  2  3 bit
 #3:  2  4 00g
 #4:  3  4 ct0
 #5:  3  5 er0
 #6:  4  5 d00
 #7:  6  7 00i
 #8:  8  9 00t

更新

如果您有文件P1.txtP2.txt等,则可以将文件读入列表,然后应用上述代码。假设文件在工作目录中

  files <- list.files(pattern="^P\\d+")
  dfs <-lapply(files, function(x) read.table(x, header=TRUE, sep="") #or use `fread` from `data.table`
  nm1 <- gsub("\\..*", "", files)
  dfs <- setNames(dfs, nm1)

您需要将df1:df3代码中的dplyr替换为P1:P100

  unnest(dfs,group) %>%
              spread(group, v3, fill=0) %>% 
                unite(v3, P1:P100, sep="")


###data

df1 <- structure(list(v1 = c(1L, 2L, 3L, 4L, 3L), v2 = c(2L, 3L, 4L, 
5L, 5L), v3 = c("a", "b", "c", "d", "e")), .Names = c("v1", "v2", 
"v3"), class = "data.frame", row.names = c("1", "2", "3", "4", 
"5"))

df2 <- structure(list(v1 = c(1L, 2L, 3L, 3L), v2 = 2:5, v3 = c("j", 
"i", "t", "r")), .Names = c("v1", "v2", "v3"), class = "data.frame", 
 row.names = c("1", "2", "3", "4"))

df3 <- structure(list(v1 = c(2L, 2L, 6L, 8L), v2 = c(3L, 4L, 7L, 9L), 
 v3 = c("t", "g", "i", "t")), .Names = c("v1", "v2", "v3"), class = "data.frame",
 row.names = c("1", "2", "3", "4"))

 dfs <- mget(ls(pattern="^df\\d+"))

答案 1 :(得分:3)

您可以使用Reduce操作数据框列表来执行此操作:

dfs <- list(df1, df2, df3)
merged <- Reduce(function(x, y) merge(x, y, by=c("v1", "v2"), all=T), dfs)
v3 <- gsub("NA", "0", apply(merged[,c(-1, -2)], 1, paste, collapse=""))
data.frame(v1=merged$v1, v2=merged$v2, v3)
#   v1 v2  v3
# 1  1  2 aj0
# 2  2  3 bit
# 3  2  4 00g
# 4  3  4 ct0
# 5  3  5 er0
# 6  4  5 d00
# 7  6  7 00i
# 8  8  9 00t