我有一个数据表列表,如下所示:
group1 <- data.table(
group = rep(x = c("group1"), each = 16),
amount = rep(x = 7:4, each = 4),
subgr = rep(x = 1:2, each = 8),
ind = rep(x = 0:1, each = 4, times = 2)
)
group2 <- data.table(
group = rep(x = c("group2"), each = 36),
amount = rep(x = 13:8, each = 6),
subgr = rep(x = 1:3, each = 12),
ind = rep(x = 0:1, each = 6, times = 3)
)
mydt <- rbind(group1, group2)
mydt <- lapply(X = split(x = 1:nrow(mydt), f = mydt[["group"]]),
FUN = function(i)mydt[i])
上面提到的对象过于简单,实际列表包含更多更大的data.table
s,每个在subgr
上分布的行数和{{1}的数量方面略有不同}} 他们自己。我想要实现的是:
subgr
中创建多个列,其数量等于data.table
中唯一值的数量。每个新列都是subgr
的副本。复制列的数量将等于amount
中唯一值的数量。subgr
中新创建的列(如果subgr
则为amount*2
,ind == 1
为amount*4
),其余值保留在ind ==0
subgr
中的子组未受影响。也就是说,有这样的东西(这里只显示mydt$group1
,但它适用于所有表格):
$group1
group amount subgr ind am1 am2
1: group1 7 1 0 28 7
2: group1 7 1 0 28 7
3: group1 7 1 0 28 7
4: group1 7 1 0 28 7
5: group1 6 1 1 12 6
6: group1 6 1 1 12 6
7: group1 6 1 1 12 6
8: group1 6 1 1 12 6
9: group1 5 2 0 5 20
10: group1 5 2 0 5 20
11: group1 5 2 0 5 20
12: group1 5 2 0 5 20
13: group1 4 2 1 4 8
14: group1 4 2 1 4 8
15: group1 4 2 1 4 8
16: group1 4 2 1 4 8
我知道将data.table
拆分为data.table
列表并不是一个好主意,如this post中所述,但这就是对象的方式。除此之外,拆分与我需要执行的任务有关:
subgr
定义的子组,它们的编号在不同的数据表中也有所不同,即新列的数量在整个列表中会有所不同。也就是说,无法一次处理整个data.table
,因为将为group
变量中的每个组创建不同数量的列。
到目前为止我尝试使用this post接受的答案中的第二个解决方案编写函数:
myfun <- function(data, quantity, region, index) {
data <- lapply(data, function(i) {
i[ , eval(paste0("am", unique(i[[region]]))) := i[[quantity]]]
})
data <- lapply(X = data, FUN = function(i) {
rep.names <- paste0("am", unique(i[[region]]))
i[ , eval(rep.names) := lapply(.SD, function(j) {
ifelse(i[["ind"]] == 1L, j*2L, j*4L)
}), by = region, .SDcols = rep.names]
})
return(data)
}
myfun(mydt, quantity = "amount", region = "subgr", index = "ind")
它不能按预期工作,它根据条件修改所有变量中的整个值范围。然而,它会引发警告,这就是问题所在。这是第一个警告,另一个是相同的:
Warning messages:
1: In `[.data.table`(i, , `:=`(eval(rep.names), lapply(.SD, ... :
RHS 1 is length 16 (greater than the size (8) of group 1). The last
8 element(s) will be discarded.
也就是说,它只使用它在LHS上的行,但是然后将整个列用于RHS。显然我在这里缺少一些重要的东西。与[this post] [3]中接受的答案中的第二个解决方案的不同之处在于,有多个列可供使用,而在我的情况下只有一个(amount
)。
有人可以帮忙吗?
答案 0 :(得分:3)
我建议这是一个for循环适合的任务。您可以遍历列表并就地修改每个data.table,而无需重建列表,这是lapply()
所做的。
此外,我建议您在将矩阵分配到目标data.table之前构建矩阵中的am*
列。通过传递amount
作为基础数据向量,我们可以在一次完成中完成所有am*
列的完成,因为大多数单元格直接从amount
列获取其值没有任何变化,特别是如果有许多独特的subgr
值。之后,我们可以通过使用索引矩阵索引分配数据矩阵来选择性地修改必须更改的单元格。构建索引矩阵将相当容易,因为我们知道每行只能更改一个单元格。基本上我们可以cbind()
行索引序列.I
,其中包含从match(subgr,grs)
计算的所需列索引,其中grs
是唯一的subgr
值集合。这比为每个j==i[[region]]
列进行am*
之类的相等比较更有效。
for (i in seq_along(mydt)) {
grs <- unique(mydt[[i]]$subgr);
mydt[[i]][,paste0('am',grs):={
m <- matrix(amount,.N,length(grs));
m[cbind(.I,match(subgr,grs))] <- amount*ifelse(ind==1L,2L,4L);
as.data.frame(m);
}];
}; ## end for
mydt;
## $group1
## group amount subgr ind am1 am2
## 1: group1 7 1 0 28 7
## 2: group1 7 1 0 28 7
## 3: group1 7 1 0 28 7
## 4: group1 7 1 0 28 7
## 5: group1 6 1 1 12 6
## 6: group1 6 1 1 12 6
## 7: group1 6 1 1 12 6
## 8: group1 6 1 1 12 6
## 9: group1 5 2 0 5 20
## 10: group1 5 2 0 5 20
## 11: group1 5 2 0 5 20
## 12: group1 5 2 0 5 20
## 13: group1 4 2 1 4 8
## 14: group1 4 2 1 4 8
## 15: group1 4 2 1 4 8
## 16: group1 4 2 1 4 8
##
## $group2
## group amount subgr ind am1 am2 am3
## 1: group2 13 1 0 52 13 13
## 2: group2 13 1 0 52 13 13
## 3: group2 13 1 0 52 13 13
## 4: group2 13 1 0 52 13 13
## 5: group2 13 1 0 52 13 13
## 6: group2 13 1 0 52 13 13
## 7: group2 12 1 1 24 12 12
## 8: group2 12 1 1 24 12 12
## 9: group2 12 1 1 24 12 12
## 10: group2 12 1 1 24 12 12
## 11: group2 12 1 1 24 12 12
## 12: group2 12 1 1 24 12 12
## 13: group2 11 2 0 11 44 11
## 14: group2 11 2 0 11 44 11
## 15: group2 11 2 0 11 44 11
## 16: group2 11 2 0 11 44 11
## 17: group2 11 2 0 11 44 11
## 18: group2 11 2 0 11 44 11
## 19: group2 10 2 1 10 20 10
## 20: group2 10 2 1 10 20 10
## 21: group2 10 2 1 10 20 10
## 22: group2 10 2 1 10 20 10
## 23: group2 10 2 1 10 20 10
## 24: group2 10 2 1 10 20 10
## 25: group2 9 3 0 9 9 36
## 26: group2 9 3 0 9 9 36
## 27: group2 9 3 0 9 9 36
## 28: group2 9 3 0 9 9 36
## 29: group2 9 3 0 9 9 36
## 30: group2 9 3 0 9 9 36
## 31: group2 8 3 1 8 8 16
## 32: group2 8 3 1 8 8 16
## 33: group2 8 3 1 8 8 16
## 34: group2 8 3 1 8 8 16
## 35: group2 8 3 1 8 8 16
## 36: group2 8 3 1 8 8 16
## group amount subgr ind am1 am2 am3
##
library(microbenchmark);
library(data.table);
hubert <- function(mydt) { myfun <- function(data, quantity, region, index) lapply(data, function(i) i[ , eval(paste0("am", unique(i[[region]]))) := lapply(unique(i[[region]]), function(j) {i[[quantity]]*ifelse(j==i[[region]],ifelse(ind==1, 2, 4), 1)})] ); myfun(mydt, quantity = "amount", region = "subgr", index = "ind"); };
bgoldst <- function(mydt) { for (i in seq_along(mydt)) { grs <- unique(mydt[[i]]$subgr); mydt[[i]][,paste0('am',grs):={ m <- matrix(amount,.N,length(grs)); m[cbind(.I,match(subgr,grs))] <- amount*ifelse(ind==1L,2L,4L); as.data.frame(m); }]; }; mydt; };
## OP's example
group1 <- data.table(group=rep(x=c("group1"),each=16),amount=rep(x=7:4,each=4),subgr=rep(x=1:2,each=8),ind=rep(x=0:1,each=4,times=2));
group2 <- data.table(group=rep(x=c("group2"),each=36),amount=rep(x=13:8,each=6),subgr=rep(x=1:3,each=12),ind=rep(x=0:1,each=6,times=3));
mydt <- rbind(group1,group2);
mydt <- lapply(X=split(x=1:nrow(mydt),f=mydt[["group"]]),FUN=function(i)mydt[i]);
ex <- hubert(lapply(mydt,copy));
all.equal(ex,bgoldst(lapply(mydt,copy)));
## [1] TRUE
microbenchmark(hubert(lapply(mydt,copy)),bgoldst(lapply(mydt,copy)));
## Unit: milliseconds
## expr min lq mean median uq max neval
## hubert(lapply(mydt, copy)) 2.579173 2.632417 2.837445 2.669621 2.736549 6.555914 100
## bgoldst(lapply(mydt, copy)) 2.603977 2.683092 2.880715 2.723078 2.781025 4.376168 100
## scale test
set.seed(1L);
NR <- 1e5L; NGRP <- 1e3L; NAMT <- 30L; NSUBGR <- 30L;
mydt <- data.table(group=paste0('group',sample(NGRP,NR,T)),amount=sample(NAMT,NR,T),subgr=sample(NSUBGR,NR,T),ind=sample(0:1,NR,T));
mydt <- split(mydt,mydt$group);
ex <- hubert(lapply(mydt,copy));
all.equal(ex,bgoldst(lapply(mydt,copy)));
## [1] TRUE
microbenchmark(hubert(lapply(mydt,copy)),bgoldst(lapply(mydt,copy)));
## Unit: seconds
## expr min lq mean median uq max neval
## hubert(lapply(mydt, copy)) 2.831080 2.899419 2.938751 2.935096 2.970701 3.110481 100
## bgoldst(lapply(mydt, copy)) 1.571023 1.647102 1.674666 1.671877 1.709434 1.845174 100
答案 1 :(得分:1)
您的错误来自i[["ind"]]
的长度,其中包含数据集中的所有行,而j
仅包含该组中的行:
ifelse(i[["ind"]] == 1L, j*2L, j*4L)
有几种可能性来解决这个问题并实现目标,这就是我要做的事情:
myfun <- function(data, quantity, region, index) {
lapply(data, function(i) {
i[ , eval(paste0("am", unique(i[[region]]))) := lapply(unique(i[[region]]), function(j)
{i[[quantity]]*ifelse(j==i[[region]],ifelse(ind==1, 2, 4), 1)})]
})
}
myfun(mydt, quantity = "amount", region = "subgr", index = "ind")
$group1
group amount subgr ind am1 am2
1: group1 7 1 0 28 7
2: group1 7 1 0 28 7
3: group1 7 1 0 28 7
4: group1 7 1 0 28 7
5: group1 6 1 1 12 6
6: group1 6 1 1 12 6
7: group1 6 1 1 12 6
8: group1 6 1 1 12 6
9: group1 5 2 0 5 20
10: group1 5 2 0 5 20
11: group1 5 2 0 5 20
12: group1 5 2 0 5 20
13: group1 4 2 1 4 8
14: group1 4 2 1 4 8
15: group1 4 2 1 4 8
16: group1 4 2 1 4 8
$group2
group amount subgr ind am1 am2 am3
1: group2 13 1 0 52 13 13
2: group2 13 1 0 52 13 13
3: group2 13 1 0 52 13 13
4: group2 13 1 0 52 13 13
5: group2 13 1 0 52 13 13
6: group2 13 1 0 52 13 13
7: group2 12 1 1 24 12 12
8: group2 12 1 1 24 12 12
9: group2 12 1 1 24 12 12
10: group2 12 1 1 24 12 12
11: group2 12 1 1 24 12 12
12: group2 12 1 1 24 12 12
13: group2 11 2 0 11 44 11
14: group2 11 2 0 11 44 11
15: group2 11 2 0 11 44 11
16: group2 11 2 0 11 44 11
17: group2 11 2 0 11 44 11
18: group2 11 2 0 11 44 11
19: group2 10 2 1 10 20 10
20: group2 10 2 1 10 20 10
21: group2 10 2 1 10 20 10
22: group2 10 2 1 10 20 10
23: group2 10 2 1 10 20 10
24: group2 10 2 1 10 20 10
25: group2 9 3 0 9 9 36
26: group2 9 3 0 9 9 36
27: group2 9 3 0 9 9 36
28: group2 9 3 0 9 9 36
29: group2 9 3 0 9 9 36
30: group2 9 3 0 9 9 36
31: group2 8 3 1 8 8 16
32: group2 8 3 1 8 8 16
33: group2 8 3 1 8 8 16
34: group2 8 3 1 8 8 16
35: group2 8 3 1 8 8 16
36: group2 8 3 1 8 8 16