我是R新手。我有一个向量
vec <- c(105,29,41,70,77,0,56,49,63,0,105)
我想依次对值求和,每遇到一个“0”就结束当前的累加并重新开始,然后用得到的这些和创建一个新的向量,例如:
vec2 <- c(322,168,105)
但是我真的不知道从哪里开始!有什么建议吗?
答案 0 :(得分:4)
从此向量开始...
> vec
[1] 105 29 41 70 77 0 56 49 63 0 105
我们可以得到一个标记各元素是否为零的TRUE / FALSE逻辑向量:
> vec == 0
[1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
对FALSE和TRUE求和时,FALSE计为0,TRUE计为1,因此,如果对这个逻辑向量逐项累加,每遇到一个TRUE,累加值就会加1。因此,使用cumsum
作为累积总和,我们得到:
> cumsum(vec==0)
[1] 0 0 0 0 0 1 1 1 1 2 2
现在,该结果定义了我们要分别求和的各个组,因此让我们按该结果来split
vec
:
> split(vec, cumsum(vec==0))
$`0`
[1] 105 29 41 70 77
$`1`
[1] 0 56 49 63
$`2`
[1] 0 105
因此,除了列表第二个及后续元素开头的零之外,这些就是我们要累加的数字。因为我们要做的是求和,所以把零也加进去不会有任何影响(但是,如果要计算均值,则必须先去掉这些零)。现在,我们使用sapply
遍历列表元素并计算总和:
> sapply(split(vec, cumsum(vec==0)),sum)
0 1 2
322 168 105
工作完成。忽略0 1 2
标签。
答案 1 :(得分:3)
另一个选项是by
as.numeric(by(vec, cumsum(vec == 0), sum))
#[1] 322 168 105
基于microbenchmark
的较大向量的方法的基准比较
# Create sample vector with N entries
# Fixed seed so the benchmark data is reproducible.
set.seed(2018)
N <- 10000
# Draw N values from 1..100 with replacement. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
vec <- sample(100, N, replace = TRUE)
# Overwrite 100 distinct, randomly chosen positions with the 0 separator.
vec[sample(length(vec), 100)] <- 0
library(microbenchmark)
# Time six implementations of the same task on `vec`: summing the runs of
# values that are delimited by zeros. Each named argument is one candidate;
# the timing table is stored in `res` and printed at the end.
res <- microbenchmark(
# vapply: index-based; one sum per group, with one more group than zeros.
vapply = {
I <- which(vec == 0)
vapply(1:(length(I)+1),
# For k = 1, I[k-1] is empty, so max() falls back to 1 (start of vec).
# For the last k, I[k] is NA and is dropped by na.rm = TRUE, leaving
# length(vec) as the end of the range.
function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
numeric(1))
},
# by: group by the running count of zeros, then strip the `by` attributes.
by = {
as.numeric(by(vec, cumsum(vec == 0), sum))
},
# aggregate: same grouping; [[2]] extracts the column holding the sums.
aggregate = {
aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
},
# split: split vec into a list of groups and sum each element.
split = {
sapply(split(vec, cumsum(vec == 0)), sum)
},
# Reduce: walks vec element by element; the group sums are collected in
# `ans` through `<<-` side effects, the accumulator argument y is unused.
Reduce = {
ans <- numeric(0)
s <- n <- 0
Reduce(f = function (y,x) {
# A zero closes the current group: store its sum and restart.
if(x == 0) {
ans <<- c(ans,s)
s <<- 0
}
n <<- n+1
s <<- x+s
# After the last element, store the sum of the final group.
if (n == length(vec))
ans <<- c(ans,s)
s
}, vec, init = 0, accumulate = TRUE)
ans
},
# for_loop: explicit loop over groups with a preallocated result vector.
# Requires vec to contain at least one zero: otherwise I[1] is NA and
# 1:I[1] fails.
# NOTE(review): this expression assigns to N and res, names that are also
# used for the sample size and for the benchmark result -- confirm the
# reuse is intended.
for_loop = {
I <- which(vec == 0)
n <- length(vec)
N <- length(I) + 1
res <- numeric(N)
for(k in seq_along(res)) {
# First group: from the start of vec up to the first zero.
if (k == 1) {
res[k] <- sum(vec[1:I[1]])
next
}
# Last group: from the last zero to the end of vec.
if (k == N) {
res[k] <- sum(vec[I[N-1]:n])
next
}
# Middle groups: between two consecutive zeros; the zeros themselves are
# included in the range but do not change the sum.
res[k] <- sum(vec[I[k-1]:I[k]])
}
res
}
)
res
# Unit: microseconds
# expr min lq mean median uq max
# vapply 435.658 487.4230 621.6155 511.3625 607.2005 6175.039
# by 3897.401 4187.2825 4721.3168 4436.5850 4936.2900 12365.351
# aggregate 4817.032 5392.0620 6002.2579 5831.2905 6310.3665 9782.524
# split 611.175 758.4485 895.2201 838.7665 957.0085 1516.556
# Reduce 21372.054 22169.9110 25363.8684 23022.6920 25503.6145 49255.714
# for_loop 15172.255 15846.5735 17252.6895 16445.7900 17572.7535 34401.827
library(ggplot2)
autoplot(res)
答案 2 :(得分:2)
aggregate
函数对于这种事情很有用。使用cumsum
创建分组变量(类似于@Spacedman的解释)。使用sum
函数作为聚合操作。最后的[[2]]
只是从aggregate
返回的内容中提取您想要的内容:
aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
[1] 322 168 105
答案 3 :(得分:2)
这里是vapply
# Positions of the zeros that delimit the groups.
I <- which(vec == 0)
# One sum per group; there is one more group than there are zeros.
vapply(1:(length(I)+1),
# For k = 1, I[k-1] is empty, so max() falls back to 1 (start of vec).
# For the last k, I[k] is NA and is dropped by na.rm = TRUE, leaving
# length(vec) as the end of the range.
function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
numeric(1))
# [1] 322 168 105
这是使用Reduce
# Collected group sums; one value is appended each time a group is closed.
ans <- numeric(0)
# s: running sum of the current group; n: number of elements processed so far.
s <- n <- 0
# Reduce is used only to walk over vec element by element; the result is
# gathered in `ans` through `<<-`. The accumulator argument y is not used.
Reduce(f = function (y,x) {
# A zero closes the current group: store its sum and restart at 0.
if(x == 0) {
ans <<- c(ans,s)
s <<- 0
}
n <<- n+1
s <<- x+s
# After the last element, store the sum of the final group.
if(n == length(vec))
ans <<- c(ans,s)
s
}, vec, init = 0, accumulate = TRUE)
ans
# [1] 322 168 105
或者也许是老式的循环
# Sum the runs of `vec` that are delimited by zeros, using an explicit loop.
# Positions of the zero separators.
I <- which(vec == 0)
n <- length(vec)
# Group k covers vec[starts[k]:ends[k]]: the first group starts at position 1,
# every later group starts at a zero (which adds nothing to the sum), and
# each group ends just before the next zero (the last one at the end of vec).
starts <- c(1L, I)
ends <- c(I - 1L, n)
# One more group than there are zeros.
N <- length(starts)
res <- numeric(N)
for (k in seq_along(res)) {
  # length.out (rather than starts[k]:ends[k]) yields an empty index when the
  # group is empty, and avoids the NA bound that `1:I[1]` hits when vec
  # contains no zero at all.
  idx <- seq.int(starts[k], length.out = ends[k] - starts[k] + 1L)
  res[k] <- sum(vec[idx])
}
res
# [1] 322 168 105
数据
以下是用于基准测试的数据
# c.f. @MauritsEvers
# Create sample vector with N entries
# Fixed seed so the benchmark data is reproducible.
set.seed(2018)
N <- 10000
# Draw N values from 1..100 with replacement. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
vec <- sample(100, N, replace = TRUE)
# Overwrite 100 distinct, randomly chosen positions with the 0 separator.
vec[sample(length(vec), 100)] <- 0
函数
以下是第二个基准测试中用到的函数:
# Sum the runs of `vec` delimited by zeros by walking it with Reduce().
#
# vec: numeric vector in which 0 marks the start of a new group.
# Returns a numeric vector with one sum per group (numeric(0) for empty input).
reduce <- function(vec) {
  total_len <- length(vec)
  sums <- numeric(0)  # finished group sums, in order
  running <- 0        # sum of the group currently being read
  seen <- 0           # number of elements consumed so far

  # Reduce() only drives the iteration; its accumulator argument is ignored
  # and all state lives in the enclosing function frame.
  step <- function(acc, value) {
    if (value == 0) {
      # A zero closes the current group.
      sums <<- c(sums, running)
      running <<- 0
    }
    seen <<- seen + 1
    running <<- value + running
    if (seen == total_len) {
      # Last element: flush the final group.
      sums <<- c(sums, running)
    }
    running
  }

  Reduce(f = step, vec, init = 0, accumulate = TRUE)
  sums
}
# Sum the runs of `vec` delimited by zeros, one vapply() iteration per group.
#
# vec: numeric vector in which 0 marks a group boundary.
# Returns a numeric vector with one sum per group.
Vapply <- function (vec) {
  zero_idx <- which(vec == 0)
  last_idx <- length(vec)

  group_sum <- function(g) {
    # For the first group zero_idx[g - 1] is empty, so max() yields 1.
    from <- max(zero_idx[g - 1], 1)
    # For the last group zero_idx[g] is NA; na.rm = TRUE leaves last_idx.
    to <- min(zero_idx[g], last_idx, na.rm = TRUE)
    sum(vec[from:to])
  }

  # There is always one more group than there are zeros.
  vapply(seq_len(length(zero_idx) + 1), group_sum, numeric(1))
}
# Sum the runs of `vec` delimited by zeros using by().
#
# vec: numeric vector in which 0 marks the start of a new group.
# Returns a plain numeric vector with one sum per group.
By <- function (vec) {
  # The running count of zeros gives every element its group number.
  group_id <- cumsum(vec == 0)
  group_sums <- by(vec, group_id, sum)
  # Drop the "by" class and dimnames, keeping only the values.
  as.numeric(group_sums)
}
# Sum the runs of `vec` delimited by zeros using split().
#
# vec: numeric vector in which 0 marks the start of a new group.
# Returns a numeric vector with one sum per group, named "0", "1", ... after
# the running count of zeros. Integer sums are promoted to double.
Split <- function (vec) {
  # The running count of zeros gives every element its group number.
  groups <- split(vec, cumsum(vec == 0))
  # vapply() instead of sapply(): the result is always a numeric vector,
  # including for empty input, where sapply() would return list().
  vapply(groups, sum, numeric(1))
}
# Sum the runs of `vec` delimited by zeros using aggregate().
#
# vec: numeric vector in which 0 marks the start of a new group.
# Returns the vector of group sums.
Aggregate <- function (vec) {
  # The running count of zeros gives every element its group number.
  groups <- list(cumsum(vec == 0))
  summed <- aggregate(vec, by = groups, FUN = sum)
  # Column 1 holds the group ids, column 2 the sums.
  summed[[2]]
}
# Sum the runs of `vec` delimited by zeros with an explicit loop.
#
# vec: numeric vector in which 0 marks a group boundary.
# Returns a numeric vector with one sum per group; there is always one more
# group than there are zeros. A vector without zeros yields a single sum,
# and an empty vector yields a single (empty) group with sum 0.
for_loop <- function(vec) {
  zero_pos <- which(vec == 0)
  n <- length(vec)
  # Group k covers vec[starts[k]:ends[k]]: the first group starts at
  # position 1, every later group starts at a zero (which adds nothing to
  # the sum), and each group ends just before the next zero (the last one
  # at the end of vec).
  starts <- c(1L, zero_pos)
  ends <- c(zero_pos - 1L, n)
  res <- numeric(length(starts))
  for (k in seq_along(res)) {
    # length.out (rather than starts[k]:ends[k]) yields an empty index for
    # an empty group, and avoids the NA bound that `1:I[1]` hit when vec
    # contained no zero.
    idx <- seq.int(starts[k], length.out = ends[k] - starts[k] + 1L)
    res[k] <- sum(vec[idx])
  }
  res
}
# Sum the runs of `vec` delimited by zeros using rowsum().
#
# vec: numeric vector in which 0 marks the start of a new group.
# Returns the rowsum() result as is: a one-column matrix with one row per
# group (wrap the call in c() to obtain a plain vector).
Rowsum <- function (vec) {
  # The running count of zeros gives every element its group number.
  group_id <- cumsum(vec == 0)
  rowsum(vec, group_id)
}
基准测试
以下是两个基准测试过程的组合:
# c.f. @MauritsEvers
# Combined benchmark on `vec`: every approach is timed twice, once as an
# inline expression (named argument) and once through its wrapper function
# (e.g. Vapply(vec)), so the two forms can be compared side by side.
resBoth <- microbenchmark::microbenchmark(
# Index-based vapply: one sum per group, with one more group than zeros.
Vapply = {
I <- which(vec == 0)
vapply(1:(length(I)+1),
# For k = 1, I[k-1] is empty, so max() falls back to 1 (start of vec).
# For the last k, I[k] is NA and is dropped by na.rm = TRUE, leaving
# length(vec) as the end of the range.
function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
numeric(1))
},
Vapply(vec),
# Group by the running count of zeros, then strip the `by` attributes.
By = {
as.numeric(by(vec, cumsum(vec == 0), sum))
},
By(vec),
# Same grouping; [[2]] extracts the column holding the sums.
Aggregate = {
aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
},
Aggregate(vec),
# Split vec into a list of groups and sum each element.
Split = {
sapply(split(vec, cumsum(vec == 0)), sum)
},
Split(vec),
# Walks vec element by element; the group sums are collected in `ans`
# through `<<-` side effects, the accumulator argument y is unused.
reduce = {
ans <- numeric(0)
s <- n <- 0
Reduce(f = function (y,x) {
# A zero closes the current group: store its sum and restart.
if(x == 0) {
ans <<- c(ans,s)
s <<- 0
}
n <<- n+1
s <<- x+s
# After the last element, store the sum of the final group.
if (n == length(vec))
ans <<- c(ans,s)
s
}, vec, init = 0, accumulate = TRUE)
ans
},
reduce(vec),
# Explicit loop over groups with a preallocated result vector.
# Requires vec to contain at least one zero: otherwise I[1] is NA and
# 1:I[1] fails.
# NOTE(review): this expression assigns to N, the name also used for the
# sample size in the data setup -- confirm the reuse is intended.
for_loop = {
I <- which(vec == 0)
n <- length(vec)
N <- length(I) + 1
res <- numeric(N)
for(k in seq_along(res)) {
# First group: from the start of vec up to the first zero.
if (k == 1) {
res[k] <- sum(vec[1:I[1]])
next
}
# Last group: from the last zero to the end of vec.
if (k == N) {
res[k] <- sum(vec[I[N-1]:n])
next
}
# Middle groups: between two consecutive zeros; the zeros themselves are
# included in the range but do not change the sum.
res[k] <- sum(vec[I[k-1]:I[k]])
}
res
},
for_loop(vec),
# rowsum() returns a one-column matrix here, not a plain vector.
Rowsum = {rowsum(vec, cumsum(vec == 0))},
Rowsum(vec),
# Number of evaluations per expression (1000).
times = 10^3
)
结果
这是基准测试结果
resBoth
# Unit: microseconds
# expr min lq mean median uq max neval cld
# Vapply 234.121 281.5280 358.0708 311.7955 343.5215 4775.018 1000 ab
# Vapply(vec) 234.850 278.6100 376.3956 306.3260 334.4050 14564.278 1000 ab
# By 1866.029 2108.7175 2468.1208 2209.0025 2370.5520 23316.045 1000 c
# By(vec) 1870.769 2120.5695 2473.1643 2217.3900 2390.6090 21039.762 1000 c
# Aggregate 2738.324 3015.6570 3298.0863 3117.9480 3313.2295 13328.404 1000 d
# Aggregate(vec) 2733.583 2998.1530 3295.6874 3109.1955 3349.1500 8277.694 1000 d
# Split 359.202 412.0800 478.0553 444.1710 492.3080 4622.220 1000 b
# Split(vec) 366.131 410.4395 475.2633 444.1715 490.3025 4601.799 1000 b
# reduce 10862.491 13062.3755 15353.2826 14465.0870 16559.3990 76305.463 1000 g
# reduce(vec) 10403.004 12448.9965 14658.4035 13825.9995 15893.3255 67337.080 1000 f
# for_loop 6687.724 7429.4670 8518.0470 7818.0250 9023.9955 27541.136 1000 e
# for_loop(vec) 123.624 145.8690 187.2201 157.5390 177.4140 9928.200 1000 a
# Rowsum 235.579 264.3880 305.7516 282.2570 322.7360 792.068 1000 ab
# Rowsum(vec) 239.590 264.9350 307.2508 284.8100 322.0060 1778.143 1000 ab
答案 4 :(得分:1)
rowsum()
相当快。我们可以使用cumsum(vec == 0)
进行分组。
c(rowsum(vec, cumsum(vec == 0)))
# [1] 322 168 105