我有两个结构相同的数据框(data frame):它们都有两个ID列和25个字符串数据列。我想把这两个数据框合并起来,并在ID匹配时把对应数据列中的字符串拼接在一起。例如:
df_1:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A
df_2:
id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B
合并后,这应该给出
df_combined:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A, B A ... <NA>
a1 b2 A B ... A, B
a1 b3 B <NA> ... B
a2 b1 <NA> <NA> ... A
当我尝试使用 join 或 merge 时,除ID列之外的所有列都会被重复(因此我最终得到50个数据列)。我是否需要使用其他方法?
谢谢!
答案 0 :(得分:3)
如果您没有任何空字符串,则可以执行此操作:
library(dplyr)

# Stack the two tables, then, within every (id_1, id_2) pair, join the
# non-missing entries of each remaining column with ", ".
bind_rows(df_1, df_2) %>%
  group_by(id_1, id_2) %>%
  summarize_all(function(x) paste(x[!is.na(x)], collapse = ", ")) %>%
  # A column with no non-missing entries in a group collapses to "";
  # overwrite those cells with NA.
  `[<-`(. == "", value = NA)
使用 magrittr,您可以避免不太美观的 `[<-`,并将其替换为 `inset`:
library(magrittr)

# Same pipeline as before, but the final replacement step uses magrittr's
# inset() alias instead of calling `[<-` directly.
bind_rows(df_1, df_2) %>%
  group_by(id_1, id_2) %>%
  summarize_all(function(x) paste(x[!is.na(x)], collapse = ", ")) %>%
  # Cells that collapsed to "" (no non-missing entries) are set to NA.
  inset(. == "", value = NA)
答案 1 :(得分:2)
另一种解决方案是使用 `melt()` 和 `dcast()` 对数据进行重塑:
library(data.table)

# Stack both tables, reshape to long form (one row per id pair and data
# column, missing values dropped), then cast back to wide form while
# concatenating the values of each cell with toString() (", " separator).
# Cells with no values at all are filled with NA.
# Note: setDT() converts df_1 and df_2 to data.tables by reference.
rbind(setDT(df_1), setDT(df_2))[
  , melt(.SD, measure.vars = patterns("col"), na.rm = TRUE)
][
  # Arguments are named in full: no partial matching of `measure.vars`,
  # no positional aggregator, and no guessing of the value column.
  , dcast(.SD, id_1 + id_2 ~ variable, fun.aggregate = toString,
          value.var = "value", fill = NA)
]
   id_1 id_2 col_1 col2 col_25
1:   a1   b1  A, B    A     NA
2:   a1   b2     A    B   A, B
3:   a1   b3     B   NA      B
4:   a2   b1    NA   NA      A
# Sample input tables, parsed from inline text. The literal "..." placeholder
# (5th column) is dropped, and "<NA>" is read as a missing value.
df_1 <- fread(
  input = "id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A",
  na.strings = "<NA>",
  drop = 5L
)

df_2 <- fread(
  input = "id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B",
  na.strings = "<NA>",
  drop = 5L
)
答案 2 :(得分:1)
在 @zx8754 评论中提出的思路基础上展开,并使用 dplyr 包:
library(dplyr)

# Combine both tables and collapse the data columns within each
# (id_1, id_2) pair. The deprecated funs() wrappers are replaced by
# formula lambdas; the logic is otherwise unchanged.
df1 %>%
  bind_rows(df2) %>%
  # Blank out NAs in every non-id column so paste() does not emit "NA".
  mutate_at(vars(-contains("id")), ~ replace(.x, is.na(.x), "")) %>%
  group_by(id_1, id_2) %>%
  # Join each group's values with a space; trimws() removes the padding
  # left at either end by blanked-out values.
  summarise_all(~ trimws(paste(.x, collapse = " "))) %>%
  # Cells that are still empty had no data in the group: restore NA.
  mutate_all(~ replace(.x, .x == "", NA))
给出,
# A tibble: 4 x 5
# Groups:   id_1 [2]
  id_1  id_2  col_1 col2  col_25
  <chr> <chr> <chr> <chr> <chr>
1 a1    b1    A B   A     <NA>
2 a1    b2    A     B     A B
3 a1    b3    B     <NA>  B
4 a2    b1    <NA>  <NA>  A
**注意:** 上述代码假设缺失值是真正的 `NA`(不是字符),并且各数据列为字符型;如有需要,请先用 `as.character` 进行转换。
**数据**
# Reproducible sample data for the two input tables (same contents as the
# dput() output), written as runnable assignments.
# stringsAsFactors = FALSE keeps every column as character on R < 4.0 too.
df1 <- data.frame(
  id_1 = c("a1", "a1", "a2"),
  id_2 = c("b1", "b2", "b1"),
  col_1 = c("A", "A", NA),
  col2 = c("A", NA, NA),
  col_25 = c(NA, "A", "A"),
  stringsAsFactors = FALSE
)

df2 <- data.frame(
  id_1 = c("a1", "a1", "a1"),
  id_2 = c("b1", "b2", "b3"),
  col_1 = c("B", NA, "B"),
  col2 = c(NA, "B", NA),
  col_25 = c(NA, "B", "B"),
  stringsAsFactors = FALSE
)