在看过几个关于 data.table 并行化的问题((1)、(2)、(3))之后,我想把这个问题弄清楚。下面这种语法有什么问题?
library(data.table)

# Fix the RNG seed so the simulated data (and the sums below) are reproducible.
set.seed(1234)

# One million observations drawn over ids 1..10000; the table is keyed on id.
dt <- data.table(
  id = factor(sample(seq_len(10000L), size = 1e6, replace = TRUE)),
  val = rnorm(n = 1e6),
  key = "id"
)

# Aggregation that is applied to each group's subset of the data.
foo <- function(l) {
  sum(l)
}

# Serial baseline: evaluate foo() on .SD once per id.
dt2 <- dt[, foo(.SD), by = "id"]
# First parallel attempt (kept exactly as posted; it fails with the error
# quoted right after this snippet).
library(parallel)
# Start one worker process per detected core.
cl <- makeCluster(detectCores())
# splitRows() is an unexported helper of `parallel` (hence `:::`); it is used
# here to cut dt into detectCores() row chunks, one per clusterApply() call.
dt3 <- clusterApply(cl, x= parallel:::splitRows(dt, detectCores()),
# NOTE(review): because fun is lapply, each worker runs lapply(chunk, FUN, foo = foo),
# which iterates over the COLUMNS of the chunk. Inside FUN, x is then
# presumably a plain vector rather than a data.table, which would explain the
# "incorrect number of dimensions" error raised by x[, ...].
fun=lapply, FUN= function(x,foo) {
# NOTE(review): `.SD` is a symbol that only has meaning inside a data.table
# `[` call; qualifying it as data.table:::".SD" is unlikely to do what was
# intended -- confirm against the data.table documentation.
x[, foo(data.table:::".SD"), by= "id"]
}, foo= foo)
# Shut the workers down again.
stopCluster(cl)
# note that library(parallel) is annoying and you often have to do this type ("::", ":::") of exporting to the parallel package
checkForRemoteErrors(val)出错: 4个节点产生错误;第一个错误:维数不正确
# Second parallel attempt (kept exactly as posted; it fails with the error
# quoted right after this snippet).
# Start one worker process per detected core.
cl <- makeCluster(detectCores())
# Same call shape as the first attempt: row chunks of dt are sent to the
# workers, and each worker applies FUN through lapply.
dt3 <- clusterApply(cl, x= parallel:::splitRows(dt, detectCores()),
fun=lapply, FUN= function(x,foo) {
# NOTE(review): lapply still hands FUN one column at a time, so this wraps a
# single vector in a data.table. The result presumably has no `id` column,
# which would explain the "object 'id' not found" error from by= "id".
x <- data.table::data.table(x)
x[, foo(data.table:::".SD"), by= "id"]
}, foo= foo)
# Shut the workers down again.
stopCluster(cl)
checkForRemoteErrors(val)出错: 4个节点产生错误;第一个错误:找不到对象 'id'
我已经玩过很多语法了。这两个似乎是我能得到的最接近的。显然,某些事情仍然不对。
我的实际问题结构类似,但行数更多,而且我用的是一台 24 核 / 48 逻辑处理器的机器。因此,看着计算机只用了大约 4% 的计算能力(只用了 1 个核心)真的很让人恼火。
答案 0 :(得分:2)
您可以考虑用 Rserve 来实现并行。
下面的示例基于 Rserve,在本地并行使用 2 个 R 节点;这些节点同样可以分布在远程实例上。
class City
会话信息:
library(data.table)

# Same reproducible toy data as in the question.
set.seed(1234)
dt <- data.table(
  id = factor(sample(seq_len(10000L), size = 1e6, replace = TRUE)),
  val = rnorm(n = 1e6),
  key = "id"
)

# Aggregation that is applied to each group's subset of the data.
foo <- function(l) {
  sum(l)
}

library(big.data.table)

# Launch two R instances through Rserve, one per port.
library(Rserve)
port <- 6311:6312
invisible(lapply(port, function(p) {
  Rserve(debug = FALSE, port = p, args = c("--no-save"))
}))

# Client side: connect to the nodes, requesting data.table on each of them.
rscl <- rscl.connect(port = port, pkgs = "data.table")

# Build a big.data.table from the local data.table and the node connections.
bdt <- as.big.data.table(dt, rscl)

# Make `foo` available on the nodes.
rscl.assign(rscl, "foo", foo)

# Per the original author: the first query runs remotely on the nodes, the
# second runs locally on the combined result.
bdt[, foo(.SD), by = "id"][, foo(.SD), by = "id"]
# id V1
# 1: 1 10.328998
# 2: 2 -8.448441
# 3: 3 21.475910
# 4: 4 -5.302411
# 5: 5 -11.929699
# ---
# 9996: 9996 -4.905192
# 9997: 9997 -4.293194
# 9998: 9998 -2.387100
# 9999: 9999 16.530731
#10000: 10000 -15.390543
# optionally with special care
# bdt[, foo(.SD), by= "id", outer.aggregate = TRUE]