我使用的数据集结构如下:
library(data.table)

# Example data: an Id column followed by eight daily value columns
# (column names are dates in dd.mm.yyyy form).
dt <- data.table(
  Id = as.numeric(1:8),
  `01.01.2005` = c(10, 0, 0, 0, 5, 7, 7, 0),
  `02.01.2005` = c(4, 5, 7, 0, 0, 0, 9, 9),
  `03.01.2005` = c(2, 3, 3, 6, 7, 77, 45, 0),
  `04.01.2005` = c(0, 0, 0, 0, 0, 0, 0, 1),
  `05.01.2005` = c(0, 1, 1, 1, 1, 0, 0, 2),
  `06.01.2005` = c(45, 15, 45, 54, 5, 76, 67, 10),
  `07.01.2005` = c(0, 0, 0, 0, 466, 21, 832, 54),
  `08.01.2005` = c(2, 3, 734, 2, 3, 4, 5, 6)
)
>dt
Id 01.01.2005 02.01.2005 03.01.2005 04.01.2005 05.01.2005 06.01.2005 07.01.2005 08.01.2005
1: 1 10 4 2 0 0 45 0 2
2: 2 0 5 3 0 1 15 0 3
3: 3 0 7 3 0 1 45 0 734
4: 4 0 0 6 0 1 54 0 2
5: 5 5 0 7 0 1 5 466 3
6: 6 7 0 77 0 0 76 21 4
7: 7 7 9 45 0 0 67 832 5
8: 8 0 9 0 1 2 10 54 6
我需要按给定的步长(step)对相邻的列逐行求和。例如,如果步长 = 2,则把第 (2,3) 列相加,并同样处理第 (4,5)、(6,7) 和 (8,9) 列。输出需要如下所示:
>output
Id 01.01.2005-02.01.2005 03.01.2005-04.01.2005 05.01.2005-06.01.2005 07.01.2005-08.01.2005
1: 1 14 2 45 2
2: 2 5 3 16 3
3: 3 7 3 46 734
4: 4 0 6 55 2
5: 5 5 7 6 469
6: 6 7 77 76 25
7: 7 16 45 67 837
8: 8 9 1 12 60
为了达到这个目的,我使用了一个循环:
# Sum every `step` consecutive value columns of `dt`, row by row.
# The columns are grouped left to right over everything except `Id`, so the
# Id column is never included in a sum and the result keeps the original
# column order. If the number of value columns is not a multiple of `step`,
# the last group is simply shorter.
step <- 2
value_cols <- setdiff(names(dt), "Id")

# Group number of each value column: 1, 1, 2, 2, ... for step = 2.
chunk_id <- (seq_along(value_cols) - 1L) %/% step + 1L
chunks <- split(value_cols, chunk_id)

# One vector of row sums per group of columns.
chunk_sums <- lapply(
  chunks,
  function(cols) rowSums(dt[, cols, with = FALSE], na.rm = FALSE)
)

# Name each result "<first column>-<last column>"; a group holding a single
# column keeps that column's name.
names(chunk_sums) <- vapply(
  chunks,
  function(cols) {
    paste(unique(c(cols[1L], cols[length(cols)])), collapse = "-")
  },
  character(1L)
)

# Add all summed columns in one step instead of growing the table per group.
output <- dt[, list(Id)]
output[, (names(chunk_sums)) := chunk_sums]
但是对于大型数据集,这种方法非常慢。是否存在无需循环就能完成这一任务的函数?
此外:"步长"(step)需要是一个可变的输入。
提前致谢
答案 0 :(得分:7)
这是另一种可能的方法
step <- 2

# Long format: one row per (Id, original column). `variable` is a factor whose
# levels follow the original column order.
temp <- melt(dt, id.vars = "Id")

# Bucket number of each original column: 1, 1, 2, 2, ... for step = 2.
# It is derived from the column position rather than from a rep() of fixed
# length, so a column count that is not a multiple of `step` yields a shorter
# last bucket instead of a length mismatch.
temp[, indx := (as.integer(variable) - 1L) %/% as.integer(step) + 1L]

# Back to wide format, summing the values that share a bucket.
dcast(temp, Id ~ indx, fun.aggregate = sum, value.var = "value")
# Id 1 2 3 4
# 1: 1 14 2 45 2
# 2: 2 5 3 16 3
# 3: 3 7 3 46 734
# 4: 4 0 6 55 2
# 5: 5 5 7 6 469
# 6: 6 7 77 76 25
# 7: 7 16 45 67 837
# 8: 8 9 1 12 60
答案 1 :(得分:3)
你可以尝试
# Sum every `step` consecutive value columns of `DT`, row by row.
#
# DT:   a data.table; its first column is excluded from the sums and its
#       column named "Id" is carried over to the result.
# step: number of consecutive columns per group (a positive whole number).
#       It must divide the number of value columns exactly.
#
# Returns a data.table with `Id` followed by one column per group, each named
# after the group's source columns joined with "_".
# Stops with an error when `step` is invalid or does not divide the number of
# value columns.
f1 <- function(DT, step) {
  m1 <- as.matrix(DT[, -1, with = FALSE])
  if (!is.numeric(step) || length(step) != 1L || is.na(step) ||
        step < 1 || step != round(step)) {
    stop("`step` must be a single positive whole number", call. = FALSE)
  }
  if (ncol(m1) == 0L || ncol(m1) %% step != 0) {
    stop(
      "the number of value columns (", ncol(m1),
      ") must be a positive multiple of `step` (", step, ")",
      call. = FALSE
    )
  }
  # lst[[i]] holds the positions of the i-th member of every group,
  # e.g. for step = 2: columns 1, 3, 5, ... and columns 2, 4, 6, ...
  lst <- lapply(seq_len(step), function(i) seq(i, ncol(m1), by = step))
  nm1 <- do.call(
    paste,
    c(lapply(lst, function(x) colnames(m1)[x]), sep = "_")
  )
  # Adding the strided sub-matrices element-wise gives the per-group sums;
  # drop = FALSE keeps a matrix even when there is only one group.
  total <- Reduce(`+`, lapply(lst, function(i) m1[, i, drop = FALSE]))
  res <- cbind(Id = DT[["Id"]], as.data.table(total))
  setnames(res, seq_along(nm1) + 1L, nm1)
  res
}
f1(dt, 2)
# Id 01.01.2005_02.01.2005 03.01.2005_04.01.2005 05.01.2005_06.01.2005
#1: 1 14 2 45
#2: 2 5 3 16
#3: 3 7 3 46
#4: 4 0 6 55
#5: 5 5 7 6
#6: 6 7 77 76
#7: 7 16 45 67
#8: 8 9 1 12
# 07.01.2005_08.01.2005
#1: 2
#2: 3
#3: 734
#4: 2
#5: 469
#6: 25
#7: 837
#8: 60
使用其他“步骤”进行测试
# Same layout as `dt`, plus a ninth value column so that the number of value
# columns is a multiple of 3.
dt1 <- data.table(
  Id = as.numeric(1:8),
  `01.01.2005` = c(10, 0, 0, 0, 5, 7, 7, 0),
  `02.01.2005` = c(4, 5, 7, 0, 0, 0, 9, 9),
  `03.01.2005` = c(2, 3, 3, 6, 7, 77, 45, 0),
  `04.01.2005` = c(0, 0, 0, 0, 0, 0, 0, 1),
  `05.01.2005` = c(0, 1, 1, 1, 1, 0, 0, 2),
  `06.01.2005` = c(45, 15, 45, 54, 5, 76, 67, 10),
  `07.01.2005` = c(0, 0, 0, 0, 466, 21, 832, 54),
  `08.01.2005` = c(2, 3, 734, 2, 3, 4, 5, 6),
  `09.01.2005` = c(4, 3, 3, 5, 7, 9, 10, 11)
)
f1(dt1, step = 3)
# Id 01.01.2005_02.01.2005_03.01.2005 04.01.2005_05.01.2005_06.01.2005
#1: 1 16 45
#2: 2 8 16
#3: 3 10 46
#4: 4 6 55
#5: 5 12 6
#6: 6 84 76
#7: 7 61 67
#8: 8 9 13
# 07.01.2005_08.01.2005_09.01.2005
#1: 6
#2: 6
#3: 737
#4: 7
#5: 476
#6: 34
#7: 847
#8: 71
另一种选择是使用 data.table 中的 set。新函数 f2 不依赖于数据集中的列数。对于 f1,step <- 3 在 'dt' 上会失败。对于 f2,如果用于计算的列数为 8 且步长为 3,则它会得到前 3 列的 rowSums、第 4:6 列的 rowSums,以及第 7:8 列的 rowSums。在该示例中,尚不清楚 OP 在这些情况下希望如何求和(sum)。同样地,对于 step <- 5,我们得到前 5 列的 rowSums,然后是第 6:8 列的 rowSums。
# Sum every `step` consecutive value columns of `DT`, row by row, using
# data.table::set() to fill a preallocated result.
#
# DT:   a data.table; every column from the second one onwards is treated as
#       a value column, and the column named "Id" is carried over.
# step: number of consecutive columns per group. The number of value columns
#       does not have to be a multiple of `step`; the last group is then
#       shorter.
#
# Returns a data.table with `Id` followed by one column per group, each named
# after the group's source columns joined with "_".
f2 <- function(DT, step) {
  # Positions of the value columns (everything after the first column).
  indx <- seq_len(ncol(DT))[-1L]
  # Group number of each value column: 1, 1, 2, 2, ... for step = 2.
  indx1 <- (seq_along(indx) - 1L) %/% step + 1L
  lst <- split(indx, indx1)
  nm1 <- vapply(
    lst,
    function(i) paste(names(DT)[i], collapse = "_"),
    character(1L)
  )
  # Preallocate using the row count of the argument `DT`, not of a global.
  res <- as.data.table(matrix(
    0,
    ncol = length(lst),
    nrow = nrow(DT),
    dimnames = list(NULL, unname(nm1))
  ))
  for (j in seq_along(res)) {
    # Element-wise sum of the columns of group j, written into column j.
    set(
      res,
      i = NULL,
      j = j,
      value = DT[, lst[[j]], with = FALSE][, Reduce(`+`, .SD)]
    )
  }
  cbind(Id = DT[["Id"]], res)
}
f2(dt,2)
# Id 01.01.2005_02.01.2005 03.01.2005_04.01.2005 05.01.2005_06.01.2005
#1: 1 14 2 45
#2: 2 5 3 16
#3: 3 7 3 46
#4: 4 0 6 55
#5: 5 5 7 6
#6: 6 7 77 76
#7: 7 16 45 67
#8: 8 9 1 12
# 07.01.2005_08.01.2005
#1: 2
#2: 3
#3: 734
#4: 2
#5: 469
#6: 25
#7: 837
#8: 60
f2(dt,3)
# Id 01.01.2005_02.01.2005_03.01.2005 04.01.2005_05.01.2005_06.01.2005
#1: 1 16 45
#2: 2 8 16
#3: 3 10 46
#4: 4 6 55
#5: 5 12 6
#6: 6 84 76
#7: 7 61 67
#8: 8 9 13
# 07.01.2005_08.01.2005
#1: 2
#2: 3
#3: 734
#4: 2
#5: 469
#6: 25
#7: 837
#8: 60
f2(dt,5)
# Id 01.01.2005_02.01.2005_03.01.2005_04.01.2005_05.01.2005
#1: 1 16
#2: 2 9
#3: 3 11
#4: 4 7
#5: 5 13
#6: 6 84
#7: 7 61
#8: 8 12
# 06.01.2005_07.01.2005_08.01.2005
#1: 47
#2: 18
#3: 779
#4: 56
#5: 474
#6: 101
#7: 904
#8: 70
使用稍大的数据集
# Timing on a larger table: 5000 rows, an Id column plus 5000 columns of
# random digits 0-9.
set.seed(24)
dt <- cbind(
  Id = 1:5000,
  as.data.table(
    matrix(sample(0:9, 5000 * 5000, replace = TRUE), ncol = 5000)
  )
)
dim(dt)
# [1] 5000 5001
system.time(f2(dt, step = 4))
# user system elapsed
# 2.269 0.022 2.291
答案 2 :(得分:1)
与 David Arenburg 的答案基本相同,但下面的写法可能更具可读性:首先使用 data.table 中经过优化的 .GRP 运算符获取组号,然后用步长对组号做整除(%/%)得到索引。
step <- 2

# Long format: one row per (Id, original column).
temp <- melt(dt, id.vars = "Id")

# .GRP numbers the groups of `variable` 1, 2, 3, ... in order of appearance.
# Shifting by step - 1 before the integer division below turns that running
# number into a 1-based bucket index.
temp[, group := .GRP + step - 1L, by = variable]
temp[, indx := group %/% step]

# Back to wide format, summing the values that share a bucket.
dcast(temp, Id ~ indx, fun.aggregate = sum, value.var = "value")