累积计数粘贴

时间:2018-04-02 09:15:03

标签: r dplyr

我有这个数据集:

  ID Set Type Count
1  1   1    A    NA
2  2   1    R    NA
3  3   1    R    NA
4  4   1    U    NA
5  5   1    U    NA
6  6   1    U    NA
7  7   2    A    NA
8  8   3    R    NA
9  9   3    R    NA

作为dputs

mystart <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("ID", "Set", "Type", 
"Count"), class = "data.frame", row.names = c(NA, -9L))

使用dplyr包我该如何获得:

  ID Set Type  Count
1  1   1    A     A1
2  2   1    R   A1R1
3  3   1    R   A1R2
4  4   1    U A1R2U1
5  5   1    U A1R2U2
6  6   1    U A1R2U3
7  7   2    A     A1
8  8   3    R     R1
9  9   3    R     R2

再次dputs

myend <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = structure(c(1L, 
2L, 3L, 4L, 5L, 6L, 1L, 7L, 8L), .Label = c("A1", "A1R1", "A1R2", 
"A1R2U1", "A1R2U2", "A1R2U3", "R1", "R2"), class = "factor")), .Names = c("ID", 
"Set", "Type", "Count"), class = "data.frame", row.names = c(NA, 
-9L))

<小时/> 简而言之,我想计算列"type"中列"set"的观察结果,并累计打印此count(text)

检查类似的帖子,我接近这个:

myend <- structure(list(ID = 1:9, Set = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
3L, 3L), Type = structure(c(1L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L
), .Label = c("A", "R", "U"), class = "factor"), Count = c(1L, 
1L, 2L, 1L, 2L, 3L, 1L, 1L, 2L)), .Names = c("ID", "Set", "Type", 
"Count"), class = "data.frame", row.names = c(NA, -9L))

使用代码:

library(dplyr)
myend <- read.table("mydata.txt", header=TRUE, fill=TRUE)
    myend %>%
    group_by(Set, Type) %>%
    mutate(Count = seq(n())) %>%
    ungroup(myend)

非常感谢你的帮助,

5 个答案:

答案 0 :(得分:7)

Base R版本:

aggregateGroup <- function(x){

  vecs <- Reduce(f=function(a,b){ a[b] <- sum(a[b],1L,na.rm=TRUE); a },
                 init=integer(0),
                 as.character(x), 
                 accumulate = TRUE)
  # vecs is a list with something like this :
  # [[1]]
  # integer(0)
  # [[2]]
  # A 
  # 1 
  # [[3]]
  # A R 
  # 1 1 
  # ...
  # so we simply turn those vectors into characters using vapply and paste 
  # (excluding the first)
  vapply(vecs,function(y) paste0(names(y),y,collapse=''),FUN.VALUE='')[-1]
}

split(mystart$Count,mystart$Set) <- lapply(split(mystart$Type,mystart$Set), aggregateGroup)

> mystart
  ID Set Type  Count
1  1   1    A     A1
2  2   1    R   A1R1
3  3   1    R   A1R2
4  4   1    U A1R2U1
5  5   1    U A1R2U2
6  6   1    U A1R2U3
7  7   2    A     A1
8  8   3    R     R1
9  9   3    R     R2

答案 1 :(得分:6)

dplyr版本:

mystart %>%
  group_by(Set) %>%
  mutate(Count = paste0('A', cumsum(Type == 'A'), 
                        'R', cumsum(Type == 'R'),
                        'U', cumsum(Type == 'U'))) %>%
  ungroup()

哪个收益

# A tibble: 9 x 4
     ID   Set Type  Count 
  <int> <int> <chr> <chr> 
1     1     1 A     A1R0U0
2     2     1 R     A1R1U0
3     3     1 R     A1R2U0
4     4     1 U     A1R2U1
5     5     1 U     A1R2U2
6     6     1 U     A1R2U3
7     7     2 A     A1R0U0
8     8     3 R     A0R1U0
9     9     3 R     A0R2U0

<小时/> 如果你想省略计数为零的变量,你需要像它一样包围一个函数

mygroup <- function(lst) {
  name <- names(lst)
  vectors <- lapply(seq_along(lst), function(i) {
    x <- lst[[i]]
    char <- name[i]
    x <- ifelse(x == 0, "", paste0(char, x))
    return(x)
  })
  return(do.call("paste0", vectors))
}

mystart %>%
  group_by(Set) %>%
  mutate(Count = mygroup(list(A = cumsum(Type == 'A'),
                               R = cumsum(Type == 'R'), 
                               U = cumsum(Type == 'U')))) %>%
  ungroup()

这会产生

# A tibble: 9 x 4
     ID   Set Type  Count 
  <int> <int> <chr> <chr> 
1     1     1 A     A1    
2     2     1 R     A1R1  
3     3     1 R     A1R2  
4     4     1 U     A1R2U1
5     5     1 U     A1R2U2
6     6     1 U     A1R2U3
7     7     2 A     A1    
8     8     3 R     R1    
9     9     3 R     R2  

答案 2 :(得分:4)

一行解决data.table

你必须先做

require(data.table)
mystart <- as.data.table(mystart)

然后只使用一行

mystart[, .(Type,
            count = paste0(
              'A',
              cumsum(Type == 'A'),
              'R',
              countR = cumsum(Type == 'R'),
              'U',
              countU = cumsum(Type == 'U')
            )),
        by = c('Set')]

首先,您需要每个类型的cumsum并通过&#39; set&#39;

将它们粘贴在一起

cumsum(Type=='A')等于计数,因为当Type==A时,它是1,否则它是0。

您也希望将它们粘贴到一列中。因此,paste0()很好用。

您仍然需要Type列,因此我在行中添加了Type

输出:

   Set Type  count
1:   1    A A1R0U0
2:   1    R A1R1U0
3:   1    R A1R2U0
4:   1    U A1R2U1
5:   1    U A1R2U2
6:   1    U A1R2U3
7:   2    A A1R0U0
8:   3    R A0R1U0
9:   3    R A0R2U0

希望这有帮助。

顺便说一下,如果你想忽略count 0,你必须自己设计一些if-esle条款。

基本上你想要这个:如果cumsum(something) ==0NULL,esle paste0('something', cumsum(something)),那么你paste0()他们在一起。

它会变得讨厌,我不会写它。你明白了这个想法

答案 3 :(得分:3)

为此问题提供了许多优雅的解决方案。我仍在寻找dplyr方式(在固定类型上没有 - cumsum)。该函数足够通用,可以处理Type的其他值。

借助custom function作为解决方案:

library(dplyr)
mystart %>% group_by(Set, Type) %>%
  mutate(type_count = row_number()) %>%
  mutate(TypeMod = paste0(Type,type_count)) %>%
  group_by(Set) %>%
  mutate(Count = cumCat(TypeMod, type_count)) %>%
  select(-type_count, -TypeMod)


cumCat <- function(x, y){
  retVal <- character(length(x))
  prevVal = ""
  lastGrpVal = ""
  for ( i in seq_along(x)){
    if(y[i]==1){
      lastGrpVal = prevVal
    }
    retVal[i] = paste0(lastGrpVal,x[i])
    prevVal = retVal[i]     
  }
  retVal
}

# # Groups: Set [3]
#      ID   Set Type   Count 
#     <int> <int> <fctr> <chr> 
# 1     1     1 A      A1    
# 2     2     1 R      A1R1  
# 3     3     1 R      A1R2  
# 4     4     1 U      A1R2U1
# 5     5     1 U      A1R2U2
# 6     6     1 U      A1R2U3
# 7     7     2 A      A1    
# 8     8     3 R      R1    
# 9     9     3 R      R2  

答案 4 :(得分:3)

这是一个基本解决方案。

我们可以将原始字母粘贴到字母组的seq_along以获取最后2个字符,然后使用paste将结果Reduce粘贴到上一个结果的最后一个元素。

除此之外,我们使用ave按组计算。

fun <- function(x,y) paste0(x[length(x)],y,seq_along(y))

mystart$Count <- ave(as.character(mystart$Type),mystart$Set,
  FUN = function(x) unlist(Reduce(fun,split(x,x),init=NULL,accumulate = TRUE)))

#   ID Set Type  Count
# 1  1   1    A     A1
# 2  2   1    R   A1R1
# 3  3   1    R   A1R2
# 4  4   1    U A1R2U1
# 5  5   1    U A1R2U2
# 6  6   1    U A1R2U3
# 7  7   2    A     A1
# 8  8   3    R     R1
# 9  9   3    R     R2

详情

split(x,x)分割字母,如第一组所示:

with(subset(mystart,Set==1),split(Type,Type))
# $A
# [1] "A"
# 
# $R
# [1] "R" "R"
# 
# $U
# [1] "U" "U" "U"

然后fun执行此类操作,由Reduce帮助:

fun(NULL,"A")                        # [1] "A1"
fun("A1",c("R","R"))                 # [1] "A1R1" "A1R2"
fun(c("A1R1","A1R2"),c("U","U","U")) # [1] "A1R2U1" "A1R2U2" "A1R2U3"

奖金解决方案

这个使用rle并避免split的其他基本解决方案为给定示例提供了相同的输出(每当Type值按集合分组时),但不是mystart2 <- rbind(mystart,mystart)

fun2 <- function(x){
  rle_   <- rle(x)
  suffix <- paste0(x,sequence(rle_$length))
  prefix <- unlist(mapply(rep,
                          lag(unlist(
                            Reduce(paste0,paste0(rle_$values,rle_$lengths),accumulate=TRUE)
                          ),rle_$lengths[1]),
                          each=rle_$lengths))
  prefix[is.na(prefix)] <- ""
  paste0(prefix,suffix)
}

mystart$Count2 <-ave(as.character(mystart$Type), mystart$Set,FUN=fun2)