假设我有三张互相重叠的表。
A B C D
A 12 16 17 14
B 62 66 9 85
C 37 31 59 75
D 74 76 89 25
A B E F
A 12 16 11 19
B 62 66 57 28
E 24 21 4 51
F 7 1 68 22
C D E F
C 59 75 77 80
D 89 25 88 30
E 67 87 4 51
F 39 69 68 22
我想按行和按列组合它们,没有任何重复的行或列,并且行和列名称保持不变。
A B C D E F
A 12 16 17 14 11 19
B 62 66 9 85 57 28
C 37 31 59 75 77 80
D 74 76 89 25 88 30
E 24 21 67 87 4 51
F 7 1 39 69 68 22
花了三天时间,我终于拼凑出了下面这段代码(参考了这里、这里和这里的帮助,可能还有其他我已经记不清的来源):
# Import every tab-delimited table in the working directory as a data frame
# named table.1 ... table.n (numbered in the order of `file.names`).
# The pattern is anchored as "\\.tab$": the unanchored regex ".tab" matches any
# character followed by "tab", so it also selected "Blank_table.csv".
file.names <- dir(pattern = "\\.tab$")
# seq_along() performs zero iterations when no file matches, whereas
# 1:length(file.names) would iterate over c(1, 0) and fail in read.delim().
for (i in seq_along(file.names)) {
  nam <- paste0("table.", i) # object name for the i-th file
  # The first column of each file supplies the row names; the round trip
  # through as.matrix() is kept from the original import.
  assign(nam, as.data.frame(as.matrix(read.delim(file.names[i],
    row.names = 1, header = TRUE, sep = "\t", stringsAsFactors = FALSE
  ))))
}
# Read the blank template: a comma-separated file that holds only the row and
# column names and is later filled from the smaller data tables.
# The first column supplies the row names.
out.file <- as.data.frame(
  as.matrix(read.csv("Blank_table.csv", row.names = 1))
)
# Collect the imported data frames (table.1 ... table.n) into a list.
# The pattern is anchored so only objects named exactly "table.<digits>" are
# picked up; lapply(..., get) keeps the list unnamed, as before.
file.names <- lapply(ls(pattern = "^table\\.[0-9]+$"), get)
# Add a `rows` column holding the row names to serve as the merge/grouping
# key, because using merge() on data frames destroys row names.
out.file$rows <- rownames(out.file)
# A single lapply() also copes with an empty list, where the former
# 1:length(file.names) loop failed on file.names[[1]].
file.names <- lapply(
  file.names,
  function(tbl) cbind(tbl, rows = rownames(tbl))
)
# Combine the tables by folding each one into out.file in turn.
# merge(all = TRUE) keeps the rows of both inputs; aggregate() then collapses
# them to one row per `rows` value by averaging every other column with NAs
# removed (na.pass keeps rows that still contain NAs in the model frame).
# Iterating over the list elements directly performs no iterations on an empty
# list (1:length(file.names) would have run for c(1, 0)) and hands merge() a
# data frame instead of a one-element list.
for (tbl in file.names) {
  out.file <- aggregate(. ~ rows,
    data = merge(out.file, tbl, all = TRUE),
    na.action = na.pass, FUN = mean, na.rm = TRUE
  )
}
这就是我想要的,但是当我合并数百个表时需要很长时间。我觉得可能有一种更简单的方法可以做到,但我不想再花三天的时间来试错。
我想象的是这样的事情:
有什么建议吗?
更新:以下是用 dput 导出的示例表:
# Sample input tables: integer values, with row names drawn from the same
# labels as the column names.
table.1 <- data.frame(
  A = c(12L, 62L, 37L, 74L),
  B = c(16L, 66L, 31L, 76L),
  C = c(17L, 9L, 59L, 89L),
  D = c(14L, 85L, 75L, 25L),
  row.names = c("A", "B", "C", "D")
)
table.2 <- data.frame(
  A = c(12L, 62L, 24L, 7L),
  B = c(16L, 66L, 21L, 1L),
  E = c(11L, 57L, 4L, 68L),
  F = c(19L, 28L, 51L, 22L),
  row.names = c("A", "B", "E", "F")
)
# NOTE(review): row F / column F is 24L here but 22L in table.2 -- confirm
# whether the overlapping cells are meant to disagree.
table.3 <- data.frame(
  C = c(59L, 89L, 67L, 39L),
  D = c(75L, 25L, 87L, 69L),
  E = c(77L, 88L, 4L, 68L),
  F = c(80L, 30L, 51L, 24L),
  row.names = c("C", "D", "E", "F")
)
# Blank 6 x 6 template (all NA) carrying the full set of row and column names.
out.file <- data.frame(
  A = rep(NA, 6L),
  B = rep(NA, 6L),
  C = rep(NA, 6L),
  D = rep(NA, 6L),
  E = rep(NA, 6L),
  F = rep(NA, 6L),
  row.names = c("A", "B", "C", "D", "E", "F")
)
答案 0 :(得分:2)
子集化解决方案,没有额外的包(使用@emehex定义的df1,df2和df3):
# Data frames to combine
DF <- list(df1, df2, df3)

# Every distinct column and row label across the inputs, in order of first
# appearance
COL <- unique(unlist(lapply(DF, colnames)))
ROW <- unique(unlist(lapply(DF, rownames)))

# All-NA matrix with one cell per (row label, column label) combination
TOTAL <- matrix(NA, length(ROW), length(COL), dimnames = list(ROW, COL))

# Write each data frame into the cells addressed by its own row and column
# names; where inputs overlap, the later data frame overwrites the earlier one.
for (df in DF) {
  TOTAL[rownames(df), colnames(df)] <- as.matrix(df)
}
子集赋值比合并更快,在数据框很多时可能更高效(参见 @aichao 在此处的回答:For each row extract the value in the column name that match another value in the cell)。您只需把 DF 列表替换为您代码中的 file.names 列表即可。
答案 1 :(得分:0)
不知道你的.csvs是什么样的,所以这是我能做的最好的事情(上面有三个示例表)......
数据导入
"2001-07-30"
# Sample tables parsed from inline text. Each header line has one field fewer
# than the data lines, so read.table() uses the first column as row names.
df1 <- read.table(text =
"A B C D
A 12 16 17 14
B 62 66 9 85
C 37 31 59 75
D 74 76 89 25", header = TRUE)

df2 <- read.table(text =
"A B E F
A 12 16 11 19
B 62 66 57 28
E 24 21 4 51
F 7 1 68 22", header = TRUE)

df3 <- read.table(text =
"C D E F
C 59 75 77 80
D 89 25 88 30
E 67 87 4 51
F 39 69 68 22", header = TRUE)
使用 dplyr、tibble 和 tidyr 生成输出:
library(dplyr)
library(tibble)
library(tidyr)
# Reshape one table to long format: the row names become a `Name` column and
# every remaining column is stacked into key/value pairs.
to_long <- function(tbl) {
  gather(rownames_to_column(tbl, "Name"), key, value, -Name)
}

# Long-format intermediates, one per input table
df1_c <- to_long(df1)
df2_c <- to_long(df2)
df3_c <- to_long(df3)

# Stack the long tables, drop rows duplicated across the overlapping inputs,
# then spread the keys back out into one column per original column name.
stacked <- bind_rows(df1_c, df2_c, df3_c)
df <- stacked %>%
  group_by(Name, key) %>%
  distinct(.keep_all = TRUE) %>%
  spread(key, value)