分割向量并求和

时间:2018-12-29 16:37:27

标签: r split cut

我是R新手。我有一个向量

vec <- c(105,29,41,70,77,0,56,49,63,0,105)

我想对值求和,直到出现“ 0”,然后创建一个具有这样的值的向量,例如:

vec2 <- c(322,168,105)

但是我真的不知道从哪里开始!有什么建议吗?

5 个答案:

答案 0 :(得分:4)

从此向量开始...

> vec
 [1] 105  29  41  70  77   0  56  49  63   0 105

我们可以计算逻辑零的TRUE / FALSE向量:

> vec == 0
 [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE

当您将FALSE和TRUE相加时,FALSE为零,而TRUE为1,因此,如果我们每次将向量加到TRUE时,值就会增加。因此,使用cumsum作为累积总和,我们得到:

> cumsum(vec==0)
 [1] 0 0 0 0 0 1 1 1 1 2 2

现在,该结果定义了我们要在其中添加的组,因此让该结果split vec

> split(vec, cumsum(vec==0))
$`0`
[1] 105  29  41  70  77

$`1`
[1]  0 56 49 63

$`2`
[1]   0 105

因此,除了列表第二部分和后续部分中的零外,这就是我们要累加的数字。因为我们要添加,所以我们可以添加零,并且没有任何区别(但是,如果要使用均值,则必须删除零)。现在,我们使用sapply遍历列表元素并计算总和:

> sapply(split(vec, cumsum(vec==0)),sum)
  0   1   2 
322 168 105 

工作完成。忽略0 1 2标签。

答案 1 :(得分:3)

另一个选项是by

as.numeric(by(vec, cumsum(vec == 0), sum))
#[1] 322 168 105

基准

基于microbenchmark的较大向量的方法的基准比较

# Create sample vector with N entries
set.seed(2018)
N <- 10000
vec <- sample(100, N, replace = T)
vec[sample(length(vec), 100)] <- 0

library(microbenchmark)
res <- microbenchmark(
    vapply = {
        I <- which(vec == 0)
        vapply(1:(length(I)+1),
            function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
            numeric(1))
   },
   by = {
       as.numeric(by(vec, cumsum(vec == 0), sum))
   },
   aggregate = {
       aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
   },
   split = {
       sapply(split(vec, cumsum(vec == 0)), sum)
   },
   Reduce = {
       ans <- numeric(0)
       s <- n <- 0
       Reduce(f = function (y,x) {
           if(x == 0) {
               ans <<- c(ans,s)
               s <<- 0
           }
           n <<- n+1
           s <<- x+s
           if (n == length(vec))
               ans <<- c(ans,s)
           s
       }, vec, init = 0, accumulate = TRUE)
       ans
   },
   for_loop = {
       I   <- which(vec == 0)
       n   <- length(vec)
       N   <- length(I) + 1
       res <- numeric(N)
       for(k in seq_along(res)) {
           if (k == 1) {
               res[k] <- sum(vec[1:I[1]])
               next
           }
           if (k == N) {
               res[k] <- sum(vec[I[N-1]:n])
               next
           }
           res[k] <- sum(vec[I[k-1]:I[k]])
       }
       res
   }
)
res
#    Unit: microseconds
#      expr       min         lq       mean     median         uq       max
#    vapply   435.658   487.4230   621.6155   511.3625   607.2005  6175.039
#        by  3897.401  4187.2825  4721.3168  4436.5850  4936.2900 12365.351
# aggregate  4817.032  5392.0620  6002.2579  5831.2905  6310.3665  9782.524
#     split   611.175   758.4485   895.2201   838.7665   957.0085  1516.556
#    Reduce 21372.054 22169.9110 25363.8684 23022.6920 25503.6145 49255.714
#  for_loop 15172.255 15846.5735 17252.6895 16445.7900 17572.7535 34401.827

library(ggplot2)
autoplot(res)

enter image description here

答案 2 :(得分:2)

aggregate函数对于这种事情很有用。使用cumsum创建分组变量(类似于@Spacedman的解释)。使用sum函数作为聚合操作。最后的[[2]]只是从aggregate返回的内容中提取您想要的内容:

aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]

[1] 322 168 105

答案 3 :(得分:2)

使用vapply

这里是vapply

的一个选项
I <- which(vec == 0)
vapply(1:(length(I)+1), 
       function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]), 
       numeric(1))
# [1] 322 168 105

使用Reduce

这是使用Reduce

的解决方案
ans <- numeric(0)
s <- n <- 0
Reduce(f = function (y,x) {
            if(x == 0) {
              ans <<- c(ans,s)
              s <<- 0
            }
            n <<- n+1
            s <<- x+s
            if(n == length(vec))
              ans <<- c(ans,s)
            s
       }, vec, init = 0, accumulate = TRUE)
ans
# [1] 322 168 105

带有循环

或者也许是老式的循环

I   <- which(vec == 0)
n   <- length(vec)
N   <- length(I) + 1
res <- numeric(N)
for(k in seq_along(res)) {
  if (k == 1) {
    res[k] <- sum(vec[1:I[1]])
    next
  }
  if (k == N) {
    res[k] <- sum(vec[I[N-1]:n])
    next
  }
  res[k] <- sum(vec[I[k-1]:I[k]])
}
res
# [1] 322 168 105

基准化

数据

以下是用于基准测试的数据

# c.f. @MauritsEvers
# Create sample vector with N entries
set.seed(2018)
N <- 10000
vec <- sample(100, N, replace = T)
vec[sample(length(vec), 100)] <- 0

功能

以下是第二个基准数据的功能:

reduce <- function(vec) {
  ans <- numeric(0)
  s <- n <- 0
  Reduce(f = function (y,x) {
    if(x == 0) {
      ans <<- c(ans,s)
      s <<- 0
    }
    n <<- n+1
    s <<- x+s
    if(n == length(vec))
     ans <<- c(ans,s)
     s 
  }, vec, init = 0, accumulate = TRUE)
  ans
}
Vapply <- function (vec) {
  I <- which(vec == 0)
  vapply(1:(length(I)+1), 
         function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]), 
         numeric(1))
}
By <- function (vec) as.numeric(by(vec, cumsum(vec == 0), sum))
Split <- function (vec) sapply(split(vec, cumsum(vec==0)),sum)
Aggregate <- function (vec) aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
for_loop <- function(vec) {
  I <- which(vec == 0)
  n <- length(vec)
  N <- length(I)+1
  res <- numeric(N)
  for(k in seq_along(res)) {
    if (k == 1) {
      res[k] <- sum(vec[1:I[1]])
      next
    }
    if (k == N) {
      res[k] <- sum(vec[I[N-1]:n])
      next
    }
    res[k] <- sum(vec[I[k-1]:I[k]])
  }
  res
}
Rowsum <- function (vec) rowsum(vec, cumsum(vec == 0))

基准化

以下是两个基准测试过程的组合:

# c.f. @MauritsEvers
resBoth <- microbenchmark::microbenchmark(
  Vapply = {
    I <- which(vec == 0)
    vapply(1:(length(I)+1),
           function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
           numeric(1))
  },
  Vapply(vec),
  By = {
    as.numeric(by(vec, cumsum(vec == 0), sum))
  },
  By(vec),
  Aggregate = {
    aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
  },
  Aggregate(vec),
  Split = {
    sapply(split(vec, cumsum(vec == 0)), sum)
  },
  Split(vec),
  reduce = {
    ans <- numeric(0)
    s <- n <- 0
    Reduce(f = function (y,x) {
      if(x == 0) {
        ans <<- c(ans,s)
        s <<- 0
      }
      n <<- n+1
      s <<- x+s
      if (n == length(vec))
        ans <<- c(ans,s)
      s
    }, vec, init = 0, accumulate = TRUE)
    ans
  },
  reduce(vec),
  for_loop = {
    I   <- which(vec == 0)
    n   <- length(vec)
    N   <- length(I) + 1
    res <- numeric(N)
    for(k in seq_along(res)) {
      if (k == 1) {
        res[k] <- sum(vec[1:I[1]])
        next
      }
      if (k == N) {
        res[k] <- sum(vec[I[N-1]:n])
        next
      }
      res[k] <- sum(vec[I[k-1]:I[k]])
    }
    res
  },
  for_loop(vec),
  Rowsum = {rowsum(vec, cumsum(vec == 0))},
  Rowsum(vec),
  times = 10^3
 )

结果

这是基准测试结果

resBoth
# Unit: microseconds
#           expr       min         lq       mean     median         uq       max neval     cld
#         Vapply   234.121   281.5280   358.0708   311.7955   343.5215  4775.018  1000 ab     
#    Vapply(vec)   234.850   278.6100   376.3956   306.3260   334.4050 14564.278  1000 ab     
#             By  1866.029  2108.7175  2468.1208  2209.0025  2370.5520 23316.045  1000   c    
#        By(vec)  1870.769  2120.5695  2473.1643  2217.3900  2390.6090 21039.762  1000   c    
#      Aggregate  2738.324  3015.6570  3298.0863  3117.9480  3313.2295 13328.404  1000    d   
# Aggregate(vec)  2733.583  2998.1530  3295.6874  3109.1955  3349.1500  8277.694  1000    d   
#          Split   359.202   412.0800   478.0553   444.1710   492.3080  4622.220  1000  b     
#     Split(vec)   366.131   410.4395   475.2633   444.1715   490.3025  4601.799  1000  b     
#         reduce 10862.491 13062.3755 15353.2826 14465.0870 16559.3990 76305.463  1000       g
#    reduce(vec) 10403.004 12448.9965 14658.4035 13825.9995 15893.3255 67337.080  1000      f 
#       for_loop  6687.724  7429.4670  8518.0470  7818.0250  9023.9955 27541.136  1000     e  
#  for_loop(vec)   123.624   145.8690   187.2201   157.5390   177.4140  9928.200  1000 a      
#         Rowsum   235.579   264.3880   305.7516   282.2570   322.7360   792.068  1000 ab     
#    Rowsum(vec)   239.590   264.9350   307.2508   284.8100   322.0060  1778.143  1000 ab  

答案 4 :(得分:1)

rowsum()相当快。我们可以使用cumsum(vec == 0)进行分组。

c(rowsum(vec, cumsum(vec == 0)))
# [1] 322 168 105