重复次数

时间:2018-11-21 17:22:08

标签: r dplyr

我有一个数据框:

# Example data: one row per (sample, event) observation with its start/end.
df <- data.frame(
  sample = c("S1", "S1", "S2", "S3", "S4", "S4"),
  event = c(1, 1, 4, 2, 3, 12),
  start = c(100, 20, 30, 500, 300, 200),
  end = c(350, 480, 60, 700, 300, 200)
)

 sample event start end
     S1     1   100 350
     S1     1    20 480
     S2     4    30  60
     S3     2   500 700
     S4     3   300 300
     S4    12   200 200

我想计算每个 sample 中不同 event 的数量,并更改 sample 的名称以反映这一点。

例如 sample S4 有两个不同的事件:3 和 12。在这里,我想获得这个结果:

 sample event start end
     S1     1   100 350
     S1     1    20 480
     S2     4    30  60
     S3     2   500 700
     S4.1     3   300 300
     S4.2    12   200 200

这就是我正在尝试的方法,但它会产生 S4.2 和 S4.2:

# Asker's attempt: event_count is the number of distinct events in the
# sample, so every row of a multi-event sample receives the same suffix
# (both S4 rows become "S4.2").
df %>%
  group_by(sample) %>%
  dplyr::mutate(
    event_count = n_distinct(event),
    sample_mod = as.character(
      ifelse(
        event_count == 1,
        as.character(sample),
        paste(sample, event_count, sep = ".")
      )
    )
  )

sample event start   end event_count sample_mod
1 S1         1   100   350           1 S1        
2 S1         1    20   480           1 S1        
3 S2         4    30    60           1 S2        
4 S3         2   500   700           1 S3        
5 S4         3   300   300           2 S4.2      
6 S4        12   200   200           2 S4.2 

如何修改这个管道以实现所需的输出?

2 个答案:

答案 0 :(得分:2)

按 sample 分组后,计算 event 中不同取值的数量,再构造一个逻辑条件,把 sample 中的值改为唯一值(make.unique)。

# Suffix a sample's name only when it has more than one distinct event.
# dense_rank() gives every row of the same event the same suffix, counting
# from 1 in ascending event order (S4.1, S4.2). make.unique() is not used:
# it leaves the first duplicate unsuffixed (S4, S4.1) and numbers rows
# rather than events, so repeated events would get different names.
df %>%
  group_by(sample) %>%
  mutate(
    n = n_distinct(event),
    sample_mod = case_when(
      n > 1 ~ paste(sample, dense_rank(event), sep = "."),
      TRUE ~ as.character(sample)
    )
  ) %>%
  ungroup()
# A tibble: 6 x 6
#  sample event start   end     n sample_mod
#  <fct>  <dbl> <dbl> <dbl> <int> <chr>
#1 S1         1   100   350     1 S1
#2 S1         1    20   480     1 S1
#3 S2         4    30    60     1 S2
#4 S3         2   500   700     1 S3
#5 S4         3   300   300     2 S4.1
#6 S4        12   200   200     2 S4.2

答案 1 :(得分:2)

library(data.table)
# Convert df to a data.table by reference (no copy is made).
setDT(df)

# Within each sample, visit rows in ascending event order and number the
# runs of equal events. A sample with a single run keeps its name; otherwise
# the run number is appended. `:=` updates the sample column in place.
df[
  order(event),
  sample := {
    event_run <- rleid(event)
    if (all(event_run == 1L)) sample else paste0(sample, ".", event_run)
  },
  by = sample
]
df
#    sample event start end
# 1:     S1     1   100 350
# 2:     S1     1    20 480
# 3:     S2     4    30  60
# 4:     S3     2   500 700
# 5:   S4.1     3   300 300
# 6:   S4.2    12   200 200

使用的数据:(注意 stringsAsFactors = FALSE)

# Example data. stringsAsFactors = FALSE keeps `sample` as character so that
# suffixed names can be assigned to it; FALSE is spelled out because `F` is
# an ordinary variable that can be reassigned.
df <- data.frame(
  sample = c("S1", "S1", "S2", "S3", "S4", "S4"),
  event = c(1, 1, 4, 2, 3, 12),
  start = c(100, 20, 30, 500, 300, 200),
  end = c(350, 480, 60, 700, 300, 200),
  stringsAsFactors = FALSE
)

基准:

# data.table variant used in the benchmark.
# Within each sample, rows are visited in ascending event order and runs of
# equal events are numbered; samples with more than one run get the run
# number appended to their name.
# setDT() and `:=` work by reference, so the object passed in is modified
# rather than copied. The updated data.table is returned invisibly.
dt <- function(df) {
  setDT(df)
  df[
    order(event),
    sample := {
      event_run <- rleid(event)
      if (all(event_run == 1L)) sample else paste0(sample, ".", event_run)
    },
    by = sample
  ]
}

# dplyr variant used in the benchmark.
# Adds `n` (distinct events per sample) and rewrites `sample` so that samples
# with more than one distinct event get a per-event suffix (S4.1, S4.2).
# dense_rank() is used instead of make.unique(), which numbers every
# duplicated row of the whole column (S4, S4.1, S4.2, ...) regardless of
# event. The rank is computed while grouped and applied after ungroup()
# because a grouping column cannot be modified inside a grouped mutate().
dply <- function(df) {
  df %>%
    group_by(sample) %>%
    mutate(
      n = n_distinct(event),
      event_rank = dense_rank(event)
    ) %>%
    ungroup() %>%
    mutate(
      sample = case_when(
        n > 1 ~ paste(sample, event_rank, sep = "."),
        TRUE ~ as.character(sample)
      )
    ) %>%
    select(-event_rank)
}

# Stack 1000 copies of the example data to get a larger benchmark input.
df <- rbindlist(replicate(1000, df, simplify = FALSE))

# NOTE(review): dt() updates df by reference, so after its first evaluation
# the remaining iterations (and dply()) run on already-renamed samples.
microbenchmark::microbenchmark(dt(df), dply(df))
# Unit: milliseconds
#      expr      min       lq     mean   median       uq       max neval
#    dt(df) 1.750972 1.970664 2.332920 2.075279 2.391176  8.306448   100
#  dply(df) 5.982349 6.277939 7.046036 6.566759 7.036501 15.112181   100