如何合并这些数据框并按列取平均值?

时间:2016-06-01 02:29:24

标签: r dataframe merge mean

我合并了两个数据框,现在想对相对应的列取平均值,并把结果整理成一个数据框。这是合并后的数据框:

           ARN     ADOPTION.x    EUTHANASIA.x     OTHER.x           ADOPTION.y     EUTHANASIA.y     OTHER.y
99645 A1428364 0.998701572418 0.0000001962661 0.001298154    0.990897715091705 0.00000014520751 0.009102128
99646 A1428368 0.000122911530 0.9927836656570 0.007093379  0.00270643248222768 0.00152872898616 0.995764852
99647 A1428369 0.978477597237 0.0000004987147 0.021521868    0.933309495449066 0.00000007226861 0.066690445
99648 A1428370 0.004124862142 0.8476260900497 0.148249015 0.000234879378695041 0.87564605474472 0.124119066
99649 A4081681 0.464215517044 0.0024338855874 0.533350587    0.699311196804047 0.04458642378449 0.256102324
99650 A4087524 0.000006879755 0.0021564548369 0.997836649 0.000240802211919799 0.00144882441964 0.998310328

例如,我想取 ADOPTION.x 和 ADOPTION.y 两列的平均值,得到名为 "ADOPTION" 的列,然后对其余每一对列重复同样的操作。我该怎么做呢?

3 个答案:

答案 0 :(得分:2)

我们可以分别取出以 .x 和 .y 结尾的列,然后对它们求平均值

# `df1` is the merged data frame here: an ARN column plus value columns
# ending in ".x" and ".y".
x_cols <- grep("\\.x$", names(df1), value = TRUE)
# Base variable names with the suffix removed, e.g. "ADOPTION.x" -> "ADOPTION".
vars <- sub("\\.x$", "", x_cols)
# Pair every ".x" column with its ".y" partner by name rather than by
# position, so the result does not depend on the column order of `df1`;
# a missing partner fails with "undefined columns selected".
d1 <- (df1[x_cols] + df1[paste0(vars, ".y")]) / 2
colnames(d1) <- vars
# Put the id column back in front of the averaged columns.
d2 <- cbind(df1["ARN"], d1)
d2
#        ARN    ADOPTION      EUTHANASIA       OTHER
#99645 A1428364 0.994799644 0.0000001707368 0.005200141
#99646 A1428368 0.001414672 0.4971561973216 0.501429116
#99647 A1428369 0.955893546 0.0000002854917 0.044106157
#99648 A1428370 0.002179871 0.8616360723972 0.136184040
#99649 A4081681 0.581763357 0.0235101546859 0.394726456
#99650 A4087524 0.000123841 0.0018026396283 0.998073488

另一个选项是先用 melt 转换为长格式,再用 dcast 转换回宽格式

library(data.table)
# Base variable names derived from the ".x" columns,
# e.g. "ADOPTION.x" -> "ADOPTION".
vars <- sub("\\.x$", "", grep("\\.x$", names(df1), value = TRUE))
# Melt with two patterns: the ".x" columns are stacked into `x`, the ".y"
# columns into `y`, and `variable` holds the index of each pair.
# NOTE: setDT() converts `df1` to a data.table by reference.
dM <- melt(
  setDT(df1),
  measure.vars = patterns("\\.x$", "\\.y$"),
  value.name = c("x", "y")
)
# Replace the pair index with the matching variable name.
dM[, variable := vars[variable]]
# Keep only the id, the variable name and the mean of the two values.
dM <- dM[, .(ARN, variable, value = (x + y) / 2)]
# Back to wide form: one row per ARN, one column per variable.
dcast(dM, ARN ~ variable, value.var = "value")
#      ARN    ADOPTION      EUTHANASIA       OTHER
#1: A1428364 0.994799644 0.0000001707368 0.005200141
#2: A1428368 0.001414672 0.4971561973216 0.501429116
#3: A1428369 0.955893546 0.0000002854917 0.044106157
#4: A1428370 0.002179871 0.8616360723972 0.136184040
#5: A4081681 0.581763357 0.0235101546859 0.394726456
#6: A4087524 0.000123841 0.0018026396283 0.998073488

答案 1 :(得分:1)

@akrun建议的长格式+聚合方法的变体:

# Reshape the merged frame `dat` to long form: every column except the first
# is treated as time-varying and its name is split at "." into a variable
# name and a time suffix.
datlong <- reshape(
  dat,
  direction = "long",
  idvar = "ARN",
  varying = -1,
  sep = "."
)
# Value columns are everything after the first two columns.
nms <- names(datlong)[-(1:2)]
# Running index of each row within its `time` group.
datlong$seq <- ave(datlong$time, datlong$time, FUN = seq_along)
# Average the value columns per ARN/seq pair, then drop the helper `seq`
# column (second column of the result).
aggregate(datlong[nms], by = datlong[c("ARN", "seq")], FUN = mean)[-2]

#       ARN    ADOPTION   EUTHANASIA       OTHER
#1 A1428364 0.994799644 1.707368e-07 0.005200141
#2 A1428368 0.001414672 4.971562e-01 0.501429116
#3 A1428369 0.955893546 2.854917e-07 0.044106157
#4 A1428370 0.002179871 8.616361e-01 0.136184040
#5 A4081681 0.581763357 2.351015e-02 0.394726456
#6 A4087524 0.000123841 1.802640e-03 0.998073488

答案 2 :(得分:0)

Hadleyverse版本:

library(dplyr)
library(tidyr)

df %>%
    # reshape to one row per ARN and suffix: each name such as "ADOPTION.x"
    # is split at the dot; the first part stays a column (".value") and the
    # suffix goes into `origin`
    pivot_longer(-ARN, names_to = c(".value", "origin"), names_sep = "\\.") %>%
    # set grouping for aggregation
    group_by(ARN) %>%
    # average each variable within each group, leaving out the suffix column
    summarise(across(-origin, mean))

# Source: local data frame [6 x 4]
# 
#        ARN    ADOPTION   EUTHANASIA       OTHER
#     (fctr)       (dbl)        (dbl)       (dbl)
# 1 A1428364 0.994799644 1.707368e-07 0.005200141
# 2 A1428368 0.001414672 4.971562e-01 0.501429116
# 3 A1428369 0.955893546 2.854917e-07 0.044106157
# 4 A1428370 0.002179871 8.616361e-01 0.136184040
# 5 A4081681 0.581763357 2.351015e-02 0.394726456
# 6 A4087524 0.000123841 1.802640e-03 0.998073488

请注意,与其先合并,不如直接把两个数据框按行绑定,再按 ID 列分组并聚合,这样要容易得多,即

# Stack the two un-merged data frames, then average every remaining column
# within each ARN.
bind_rows(df1, df2) %>%
    group_by(ARN) %>%
    summarise(across(everything(), mean))

或者使用基础 R:

# Stack both un-merged data frames, then average every other column per ARN.
aggregate(. ~ ARN, data = rbind(df1, df2), FUN = mean)

data.table

library(data.table)

# Stack both un-merged data frames, convert the result to a data.table and
# average every remaining column within each ARN.
as.data.table(rbind(df1, df2))[, lapply(.SD, mean), by = "ARN"]

数据

# Merged example data: one row per ARN, with each measure present twice
# (suffix ".x" and suffix ".y").
df <- data.frame(
  ARN = factor(
    1:6,
    labels = c(
      "A1428364", "A1428368", "A1428369",
      "A1428370", "A4081681", "A4087524"
    )
  ),
  ADOPTION.x = c(
    0.998701572418, 0.00012291153, 0.978477597237,
    0.004124862142, 0.464215517044, 6.879755e-06
  ),
  EUTHANASIA.x = c(
    1.962661e-07, 0.992783665657, 4.987147e-07,
    0.8476260900497, 0.0024338855874, 0.0021564548369
  ),
  OTHER.x = c(
    0.001298154, 0.007093379, 0.021521868,
    0.148249015, 0.533350587, 0.997836649
  ),
  ADOPTION.y = c(
    0.990897715091705, 0.00270643248222768, 0.933309495449066,
    0.000234879378695041, 0.699311196804047, 0.000240802211919799
  ),
  EUTHANASIA.y = c(
    1.4520751e-07, 0.00152872898616, 7.226861e-08,
    0.87564605474472, 0.04458642378449, 0.00144882441964
  ),
  OTHER.y = c(
    0.009102128, 0.995764852, 0.066690445,
    0.124119066, 0.256102324, 0.998310328
  )
)

# First un-merged example data frame; its values are the ones that appear
# under the ".x" suffix in the merged frame `df`.
df1 <- data.frame(
  ARN = factor(
    1:6,
    labels = c(
      "A1428364", "A1428368", "A1428369",
      "A1428370", "A4081681", "A4087524"
    )
  ),
  ADOPTION = c(
    0.998701572418, 0.00012291153, 0.978477597237,
    0.004124862142, 0.464215517044, 6.879755e-06
  ),
  EUTHANASIA = c(
    1.962661e-07, 0.992783665657, 4.987147e-07,
    0.8476260900497, 0.0024338855874, 0.0021564548369
  ),
  OTHER = c(
    0.001298154, 0.007093379, 0.021521868,
    0.148249015, 0.533350587, 0.997836649
  )
)

# Second un-merged example data frame; its values are the ones that appear
# under the ".y" suffix in the merged frame `df`.
df2 <- data.frame(
  ARN = factor(
    1:6,
    labels = c(
      "A1428364", "A1428368", "A1428369",
      "A1428370", "A4081681", "A4087524"
    )
  ),
  ADOPTION = c(
    0.990897715091705, 0.00270643248222768, 0.933309495449066,
    0.000234879378695041, 0.699311196804047, 0.000240802211919799
  ),
  EUTHANASIA = c(
    1.4520751e-07, 0.00152872898616, 7.226861e-08,
    0.87564605474472, 0.04458642378449, 0.00144882441964
  ),
  OTHER = c(
    0.009102128, 0.995764852, 0.066690445,
    0.124119066, 0.256102324, 0.998310328
  )
)