我正在尝试将两个数据帧合并在一起,这两个数据帧通过名为patient的变量相互关联。
第二个数据帧中同一患者有多条记录。我不想在合并时产生重复的患者行,但希望通过把同一列下的值连接起来,保留第二个数据帧中的唯一信息。
我尝试过用group_by手动连接某些变量,这种做法是可行的。
但是我有很多变量,逐一手动指定所有变量并不现实。
我还可以使用dplyr
连接数据帧中的每个变量,如下所示。第二种情况的问题是重复值也被串联在一起,使数据帧不必要地变大且难以处理。请参阅下面的reprex。
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Patient-level table: exactly one row per patient.
df1 <- data.frame(
  patient = letters[1:3],
  var1 = seq_len(3),
  var2 = 11:13
)
df1
#> patient var1 var2
#> 1 a 1 11
#> 2 b 2 12
#> 3 c 3 13
# Treatment-level table: two rows (one per drug/time pair) for each patient.
df2 <- data.frame(
  patient = rep(c("a", "b", "c"), each = 2),
  treatment = rep(c("drug1", "drug2"), times = 3),
  time = rep(c("time1", "time2"), times = 3),
  var3 = "constant"
)
df2
#> patient treatment time var3
#> 1 a drug1 time1 constant
#> 2 a drug2 time2 constant
#> 3 b drug1 time1 constant
#> 4 b drug2 time2 constant
#> 5 c drug1 time1 constant
#> 6 c drug2 time2 constant
# A plain left join repeats each df1 row once per matching row in df2.
df_merged <- df1 %>% left_join(df2)
#> Joining, by = "patient"
# Don't want duplicates like this
df_merged
#> patient var1 var2 treatment time var3
#> 1 a 1 11 drug1 time1 constant
#> 2 a 1 11 drug2 time2 constant
#> 3 b 2 12 drug1 time1 constant
#> 4 b 2 12 drug2 time2 constant
#> 5 c 3 13 drug1 time1 constant
#> 6 c 3 13 drug2 time2 constant
# Collapse treatment and time by hand, then keep the first row per patient.
# I can manually edit a few variables like this
df_merged2 <- df_merged %>%
  group_by(patient) %>%
  mutate(
    treatment = paste(treatment, collapse = "_"),
    time = paste(time, collapse = "_")
  ) %>%
  filter(row_number() == 1L)
df_merged2
#> # A tibble: 3 x 6
#> # Groups: patient [3]
#> patient var1 var2 treatment time var3
#> <fct> <int> <int> <chr> <chr> <fct>
#> 1 a 1 11 drug1_drug2 time1_time2 constant
#> 2 b 2 12 drug1_drug2 time1_time2 constant
#> 3 c 3 13 drug1_drug2 time1_time2 constant
# Collapse every non-grouping column per patient, then keep the first row.
# I have many variables I can't specify manually
# I can create this merged data frame, but I don't want to
# concatenate duplicated values such as var1, var2, and var3
df_merged3 <- df_merged %>%
  group_by(patient) %>%
  mutate_at(
    vars(-group_cols()),
    function(column) paste(column, collapse = "_")
  ) %>%
  filter(row_number() == 1L)
df_merged3
#> # A tibble: 3 x 6
#> # Groups: patient [3]
#> patient var1 var2 treatment time var3
#> <fct> <chr> <chr> <chr> <chr> <chr>
#> 1 a 1_1 11_11 drug1_drug2 time1_time2 constant_constant
#> 2 b 2_2 12_12 drug1_drug2 time1_time2 constant_constant
#> 3 c 3_3 13_13 drug1_drug2 time1_time2 constant_constant
由reprex package(v0.3.0)于2019-10-23创建
我想知道是否有办法在连接各变量时只保留唯一值,这样既能保留第二个数据帧中的信息,又不会像df_merged那样产生重复的行。
如果您有除dplyr以外的其他建议,我也很乐意听取。
data.table解决方案可能同样适合我,因为我的实际数据帧很大。
谢谢!
答案 0 :(得分:1)
我们可以使用summarise_at
和unique
library(dplyr)
# Per patient, reduce every non-grouping column to its distinct values joined
# by "_", which yields a single row per patient.
df_merged %>%
  group_by(patient) %>%
  summarise_at(
    vars(-group_cols()),
    function(column) paste(unique(column), collapse = "_")
  )
或者,我们可以先对df2进行汇总,再直接合并/联接,这样就不必在全局环境(Global Env)中创建或修改中间数据帧。
# Summarise df2 down to one row per patient first, then join it onto df1, so
# the duplicated rows are never created.
df2 %>%
  group_by(patient) %>%
  summarise_at(
    vars(-group_cols()),
    function(column) paste(unique(column), collapse = "_")
  ) %>%
  ungroup() %>%
  left_join(df1, .)
Joining, by = "patient"
patient var1 var2 treatment time var3
1 a 1 11 drug1_drug2 time1_time2 constant
2 b 2 12 drug1_drug2 time1_time2 constant
3 c 3 13 drug1_drug2 time1_time2 constant
# Toy example to experiment with: insert browser() inside the reducer to step
# through it, and see ?Reduce for more info.
#
# Collapse a vector into a single "_"-separated string, keeping only the first
# occurrence of each value. NA is written as "." and is always appended, so
# repeated NAs are never deduplicated.
#
# @param x Vector of values (character or factor); coerced with as.character().
# @return A length-one character string. Length-one input is returned as-is
#   (a lone NA stays NA) and empty input gives NULL, as Reduce() does.
paste_mod <- function(x) {
  # Factors would otherwise leak their integer codes into the result.
  x <- as.character(x)
  Reduce(function(u, v) {
    if (is.na(u)) u <- "."
    if (is.na(v)) v <- "."
    # Literal whole-token lookup: wrapping both sides in "_" stops "drug1" from
    # matching inside "drug10", and fixed = TRUE stops regex metacharacters in
    # the data (e.g. "+", "(") from being interpreted as a pattern.
    seen <- grepl(paste0("_", v, "_"), paste0("_", u, "_"), fixed = TRUE)
    if (v == "." || !seen) paste0(u, "_", v) else u
  }, x)
}
paste_mod(c("drug1",NA,NA,"drug2","drug1","drug2"))
[1] "drug1_._._drug2"
paste_mod(c(NA,NA,"drug2","drug1","drug2"))
[1] "._._drug2_drug1"
# Replace NA with "." and then fold each column per patient with Reduce,
# appending a value only when it is not already part of the running string
# ("." placeholders are always appended).
df2 %>%
  mutate_if(is.factor, as.character) %>%
  mutate_all(~ replace(., is.na(.), ".")) %>%
  group_by(patient) %>%
  summarise_at(
    vars(-group_cols()),
    ~ Reduce(function(u, v) {
      # Literal whole-token lookup: the "_" wrapping stops "drug1" from
      # matching inside "drug10", and fixed = TRUE stops regex metacharacters
      # in the data from being interpreted as a pattern.
      seen <- grepl(paste0("_", v, "_"), paste0("_", u, "_"), fixed = TRUE)
      if (v == "." || !seen) paste0(u, "_", v) else u
    }, .)
  ) %>%
  ungroup()
# A tibble: 2 x 4
patient treatment time var3
<chr> <chr> <chr> <chr>
1 a drug1_._._drug2 time1_time2 constant
2 c drug1_drug2 time1_time2 constant
新的df2
用于测试更新的解决方案
# Test data for the updated solution: patient "a" has four rows (two with a
# missing treatment), patient "c" has two; every column is a factor.
df2 <- data.frame(
  patient = factor(c("a", "a", "a", "a", "c", "c")),
  treatment = factor(c("drug1", NA, NA, "drug2", "drug1", "drug2")),
  time = factor(rep(c("time1", "time2"), times = 3)),
  var3 = factor(rep("constant", times = 6))
)