我正在寻找一种更快的方式来实现下面的操作。数据集包含> 1M行,但我提供了一个简化的例子来说明任务 -
To create the data table --
dt <- data.table(name=c("john","jill"), a1=c(1,4), a2=c(2,5), a3=c(3,6),
b1=c(10,40), b2=c(20,50), b3=c(30,60))
colGroups <- c("a","b") # Columns starting in "a", and in "b"
Original Dataset
-----------------------------------
name a1 a2 a3 b1 b2 b3
john 1 2 3 10 20 30
jill 4 5 6 40 50 60
对上面的数据集进行转换,为每个唯一名称添加2个新行,并且在每一行中,每个列的列都独立左移(在此示例中,我使用了列和b列但是有更多)
Transformed Dataset
-----------------------------------
name a1 a2 a3 b1 b2 b3
john 1 2 3 10 20 30 # First Row for John
john 2 3 0 20 30 0 # "a" values left shifted, "b" values left shifted
john 3 0 0 30 0 0 # Same as above, left-shifted again
jill 4 5 6 40 50 60 # Repeated for Jill
jill 5 6 0 50 60 0
jill 6 0 0 60 0 0
等等。我的数据集非常大,这就是我试图查看是否有一种有效的方法来实现它的原因。
提前致谢。
答案 0 :(得分:5)
更新:一个(更快)解决方案是使用索引进行如下操作(在1e6 * 7上大约需要4秒):
ll <- vector("list", 3)
ll[[1]] <- copy(dt[, -1, with=FALSE])
d_idx <- seq(2, ncol(dt), by=3)
for (j in 1:2) {
tmp <- vector("list", 2)
for (i in seq_along(colGroups)) {
idx <- ((i-1)*3+2):((i*3)+1)
tmp[[i]] <- cbind(dt[, setdiff(idx, d_idx[i]:(d_idx[i]+j-1)),
with=FALSE], data.table(matrix(0, ncol=j)))
}
ll[[j+1]] <- do.call(cbind, tmp)
}
ans <- cbind(data.table(name=dt$name), rbindlist(ll))
setkey(ans, name)
首次尝试(旧):
非常有趣的问题。我使用melt.data.table
和dcast.data.table
(来自1.8.11)进行处理,如下所示:
require(data.table)
require(reshape2)
# melt is S3 generic, calls melt.data.table, returns a data.table (very fast)
ans <- melt(dt, id=1, measure=2:7, variable.factor=FALSE)[,
grp := rep(colGroups, each=nrow(dt)*3)]
setkey(ans, name, grp)
ans <- ans[, list(variable=c(variable, variable[1:(.N-1)],
variable[1:(.N-2)]), value=c(value, value[-1],
value[-(1:2)]), id2=rep.int(1:3, 3:1)), list(name, grp)]
# dcast in reshape2 is not yet a S3 generic, have to call by full name
ans <- dcast.data.table(ans, name+id2~variable, fill=0L)[, id2 := NULL]
对具有相同列数的1e6行进行基准测试:
require(data.table)
require(reshape2)
set.seed(45)
N <- 1e6
dt <- cbind(data.table(name=paste("x", 1:N, sep="")),
matrix(sample(10, 6*N, TRUE), nrow=N))
setnames(dt, c("name", "a1", "a2", "a3", "b1", "b2", "b3"))
colGroups = c("a", "b")
system.time({
ans <- melt(dt, id=1, measure=2:7, variable.factor=FALSE)[,
grp := rep(colGroups, each=nrow(dt)*3)]
setkey(ans, name, grp)
ans <- ans[, list(variable=c(variable, variable[1:(.N-1)],
variable[1:(.N-2)]), value=c(value, value[-1],
value[-(1:2)]), id2=rep.int(1:3, 3:1)), list(name, grp)]
ans <- dcast.data.table(ans, name+id2~variable, fill=0L)[, id2 := NULL]
})
# user system elapsed
# 45.627 2.197 52.051
答案 1 :(得分:1)
您可以追加行,然后按列向上移动列。 由于每组的总列数是固定的,因此您将遍历每个组编号。
## Add in the extra rows
dt <- dt[, rbindlist(rep(list(.SD), 3)), by=name]
### ASSUMING A FIXED NUMBER PER COLGROUP
N <- 3
colsShifting <- as.vector(sapply(colGroups, paste0, 2:N))
for (i in (2:N)-1 ) {
current <- colsShifting[ (i) + ( (N-1) * (seq_along(colGroups)-1) )]
dt[, c(current) := {
.NN <- .N;
.CROP <- .SD[1:(.NN-i)] ## These lines are only for clean code. You can put it all into the `rbindlist` line
rbindlist(list(.CROP, as.data.table(replicate(ncol(.SD), rep(0, i),simplify=FALSE ))))
}
, .SDcols=current
, by=name]
}
给出:
dt
# name a1 a2 a3 b1 b2 b3
# 1: john 1 2 3 10 20 30
# 2: john 1 2 0 10 20 0
# 3: john 1 0 0 10 0 0
# 4: jill 4 5 6 40 50 60
# 5: jill 4 5 0 40 50 0
# 6: jill 4 0 0 40 0 0
答案 2 :(得分:1)
只需编辑所选答案的@Arun(s)代码。在此提供,因为我无法在评论部分发布。
#Parameterized version of @Arun (author) code (in the selected answer)
#Shifting Columns in R
#--------------------------------------------
N = 5 # SET - Number of unique names
set.seed(5)
colGroups <- c("a","b") # ... (i) # SET colGroups
totalColsPerGroup <- 10 # SET Cols Per Group
numColsToLeftShift <- 8 # SET Cols to Shift
lenColGroups <- length(colGroups) # ... (ii)
# From (i) and (ii)
totalCols = lenColGroups * totalColsPerGroup
dt <- cbind(data.table(name=paste("x", 1:N, sep="")),
matrix(sample(5, totalCols*N, TRUE), nrow=N)) # Change 5 if needed
ll <- vector("list", numColsToLeftShift)
ll[[1]] <- copy(dt[, -1, with=FALSE])
d_idx <- seq(2, ncol(dt), by=totalColsPerGroup)
for (j in 1:(numColsToLeftShift)) {
tmp <- vector("list", 2)
for (i in seq_along(colGroups)) {
idx <- ((i-1)*totalColsPerGroup+2):((i*totalColsPerGroup)+1) #OK
tmp[[i]] <- cbind(dt[, setdiff(idx, d_idx[i]:(d_idx[i]+j-1)),
with=FALSE], data.table(matrix(0, ncol=j)))
}
ll[[j+1]] <- do.call(cbind, tmp)
}
ans <- cbind(data.table(name=dt$name), rbindlist(ll))
setkey(ans, name)
-