Question

我有一个包含500行和20000列的数据框。行包含不同的样品ID，并且行中具有重复的样品ID，但列值不同。我的数据框是这样的.....

sample_name   E002.c1   E004.c1  E005.c1  E007.c1  so on...
T4456-C        0           0        0        0
T4456-C        1           0        0        1
T4456-C        1           0        1        1
T9589-C        0           1        0        0
T9589-C        1           1        0        0

有没有办法像这样合并这些ID

如果某一列中的值全部为0，则将该列的值视为0；如果该列中至少包含一个1，则将该列的值视为1。

预期输出：-

sample_name   E002.c1   E004.c1  E005.c1  E007.c1  so on...
T4456-C        1           0        1        1
T9589-C        1           1        0        0

Answer 1

还有tidyverse的可能性：

# For each sample_name, collapse every other column to 1 if any row in the
# group equals 1, otherwise 0. A purrr-style lambda is used instead of
# funs(), which has been deprecated since dplyr 0.8.0.
df %>%
  group_by(sample_name) %>%
  summarise_all(~ ifelse(any(. == 1), 1, 0))

  sample_name E002.c1 E004.c1 E005.c1 E007.c1
  <fct>         <dbl>   <dbl>   <dbl>   <dbl>
1 T4456-C          1.      0.      1.      1.
2 T9589-C          1.      1.      0.      0.

它按“ sample_name”分组，然后检查是否有任何值==1。如果是，则赋值为1，否则赋值为0。

或与data.table相同：

# data.table version of the same per-group "any 1 -> 1, else 0" reduction.
# Note: setDT() converts df to a data.table by reference.
setDT(df)[,
  lapply(.SD, function(col) ifelse(any(col == 1), 1, 0)),
  by = sample_name
]

   sample_name E002.c1 E004.c1 E005.c1 E007.c1
1:     T4456-C       1       0       1       1
2:     T9589-C       1       1       0       0

或仅使用基础R（base R）：

# Base R version: aggregate every column by sample_name, yielding 1 when the
# group contains at least one 1 and 0 otherwise.
aggregate(
  . ~ sample_name,
  data = df,
  FUN = function(values) ifelse(any(values == 1), 1, 0)
)

  sample_name E002.c1 E004.c1 E005.c1 E007.c1
1     T4456-C       1       0       1       1
2     T9589-C       1       1       0       0

或按“ sample_name”分组，然后按@R Yoda的建议汇总最大值：

# With 0/1 data the per-group maximum is 1 exactly when the group contains a
# 1. The function is passed directly instead of via funs(), which has been
# deprecated since dplyr 0.8.0.
df %>%
  group_by(sample_name) %>%
  summarise_all(max)

与data.table相同：

# data.table version: per-group maximum of every non-grouping column.
# Note: setDT() converts df to a data.table by reference.
setDT(df)[,
  lapply(.SD, max),
  by = "sample_name"
]

以及使用基础R（base R）：

# Base R version: per-group maximum of every column.
aggregate(
  . ~ sample_name,
  data = df,
  FUN = max
)

或使用整数除法（%/%）：

# Integer-division variant: `. %/% 1 == 1` flags the 1s, any() reduces the
# group to a single logical, and `* 1` converts that logical to 0/1.
# A lambda replaces funs(), which has been deprecated since dplyr 0.8.0.
df %>%
  group_by(sample_name) %>%
  summarise_all(~ any(. %/% 1 == 1) * 1)

与data.table相同：

# data.table version of the integer-division variant; the logical from any()
# is turned into 0/1 by multiplying with 1.
# Note: setDT() converts df to a data.table by reference.
setDT(df)[,
  lapply(.SD, function(col) 1 * any(col %/% 1 == 1)),
  by = sample_name
]

以及基础R（base R）：

# Base R version of the integer-division variant; the logical from any() is
# turned into 0/1 by multiplying with 1.
aggregate(
  . ~ sample_name,
  data = df,
  FUN = function(values) 1 * any(values %/% 1 == 1)
)

Answer 2

尝试一下：

library(tidyverse)

# Sum every column within each sample_name, then map positive sums to 1 and
# zero sums to 0. A lambda replaces funs(), which has been deprecated since
# dplyr 0.8.0.
df %>%
  group_by(sample_name) %>%
  summarise_all(sum) %>%
  mutate_if(is.numeric, ~ if_else(. > 0, 1, 0))

Answer 3

使用aggregate和一元运算符+的基础R（base R）方案：

# The group sum is positive exactly when the group contains a 1; the unary
# plus converts the resulting logical to an integer 0/1.
aggregate(
  . ~ sample_name,
  data = df,
  FUN = function(values) +(sum(values) > 0)
)
#  sample_name E002.c1 E004.c1 E005.c1 E007.c1
#1     T4456-C       1       0       1       1
#2     T9589-C       1       1       0       0

这避免了任何明确的ifelse条件。

样本数据

# Sample data from the question, parsed from inline text. `header = TRUE` is
# spelled out because `T` is an ordinary variable that can be reassigned.
df <- read.table(text =
    "sample_name   E002.c1   E004.c1  E005.c1  E007.c1
T4456-C        0           0        0        0
T4456-C        1           0        0        1
T4456-C        1           0        1        1
T9589-C        0           1        0        0
T9589-C        1           1        0        0", header = TRUE)

微基准分析

下面是使用由N=10^6行组成的较大数据集，对本文中介绍的所有方法进行microbenchmark分析的结果：

# Generate a benchmark data set with 10^6 rows: a grouping column drawn from
# the letters a-j and four columns of random 0/1 values. No seed is set, so
# the data differ between runs. `replace = TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
N <- 10^6
df <- data.frame(
    sample_name = sample(letters[1:10], N, replace = TRUE),
    col1 = sample(c(1, 0), N, replace = TRUE),
    col2 = sample(c(1, 0), N, replace = TRUE),
    col3 = sample(c(1, 0), N, replace = TRUE),
    col4 = sample(c(1, 0), N, replace = TRUE))

# Microbenchmark analysis
# Times every approach shown in the answers on the same input data. Besides
# microbenchmark, this needs dplyr (%>%, group_by, summarise_all, mutate_if,
# if_else) and data.table to be attached.
library(microbenchmark)

# setDT() converts its argument to a data.table by reference. Calling
# setDT(df) inside the benchmark would turn the shared `df` into a data.table
# the first time a data.table expression runs, so the tidyverse and base R
# expressions evaluated afterwards would be timed on a data.table rather than
# a data.frame. The data.table variants therefore run on a dedicated copy.
dt <- as.data.table(df)

# The dplyr expressions use lambdas / bare functions instead of funs(), which
# has been deprecated since dplyr 0.8.0.
res <- microbenchmark(
    tidyverse_ifelse_any = {
        df %>%
            group_by(sample_name) %>%
            summarise_all(~ ifelse(any(. == 1), 1, 0))
    },
    tidyverse_max = {
        df %>%
            group_by(sample_name) %>%
            summarise_all(max)
    },
    tidyverse_any_int_div = {
        df %>%
            group_by(sample_name) %>%
            summarise_all(~ any(. %/% 1 == 1) * 1)
    },
    tidyverse_mutate_if_ifelse = {
        df %>%
            group_by(sample_name) %>%
            summarise_all(sum) %>%
            mutate_if(is.numeric, ~ if_else(. > 0, 1, 0))
    },
    baseR_ifelse_any = {
        aggregate(. ~ sample_name, data = df,
                  function(x) ifelse(any(x == 1), 1, 0))
    },
    baseR_max = {
        aggregate(. ~ sample_name, data = df, max)
    },
    baseR_any_int_div = {
        aggregate(. ~ sample_name, data = df,
                  function(x) any(x %/% 1 == 1) * 1)
    },
    baseR_sum_unary_plus = {
        aggregate(. ~ sample_name, data = df, function(x) +(sum(x) > 0))
    },
    datatable_ifelse_any = {
        dt[, lapply(.SD, function(x) ifelse(any(x == 1), 1, 0)),
           by = sample_name]
    },
    datatable_any_int_div = {
        dt[, lapply(.SD, function(x) any(x %/% 1 == 1) * 1),
           by = sample_name]
    }
)

res
#Unit: milliseconds
#                       expr        min         lq       mean     median
#       tidyverse_ifelse_any   79.54145   87.49671  101.44983   96.69517
#              tidyverse_max   60.85648   66.54888   75.71105   70.26009
#      tidyverse_any_int_div  130.17937  139.99099  158.74449  152.59370
# tidyverse_mutate_if_ifelse   60.63313   66.42935   75.17535   70.19083
#           baseR_ifelse_any  933.11576 1070.73916 1157.92271 1121.52533
#                  baseR_max  895.94086 1046.37304 1121.74497 1097.73445
#          baseR_any_int_div 1003.90893 1115.72278 1179.91529 1138.17459
#       baseR_sum_unary_plus  903.09797 1049.83542 1127.51391 1099.56222
#       datatable_ifelse_any   93.47955   97.21338  111.67774  100.98314
#      datatable_any_int_div  157.81882  164.51094  179.08096  173.94033
#         uq       max neval    cld
#  109.08000  259.4346   100 ab
#   80.39179  142.4100   100 a
#  166.56710  349.8669   100   c
#   76.91358  253.4256   100 a
# 1187.60461 1775.9125   100     ef
# 1167.16448 1544.4371   100    d
# 1218.67363 1592.0093   100      f
# 1196.53435 1375.8022   100    de
#  115.57745  282.7197   100  b
#  187.37031  317.1613   100   c

library(ggplot2)
# Plot the benchmark timings stored in `res`, one entry per expression.
autoplot(res)

如果我没有弄错的话，以下两点让我感到惊讶：

基本R解决方案比tidyverse / data.table解决方案要慢得多（毕竟，tidyverse代码通常不是关于有效代码，而是关于干净代码），并且
data.table解决方案并没有比tidyverse / base R解决方案快得多。

调整数据框的形状，如果存在0和1，则考虑1

3 个答案:

样本数据

微基准分析