I have a collection of R data frames (thousands of them). Each has a categorical variable (productId) and a continuous variable (sales). I also have a distance function I wrote (my_distance) that computes the distance between two productIds within the same data frame. Since there are hundreds of productIds per data frame and thousands of data frames, I want to explore using Hadoop to speed up the process. Right now I iterate over all the data frames with a for loop and use mcmapply to compute all the distances between productIds within a given data frame. I would like to know whether this can be done in Hadoop, so that I can take advantage of parallel computation across the nodes of my cluster. Don't pay attention to the body of the distance function; it is just an example.
library(parallel)
library(reshape2)

# toy distance between two numeric vectors (placeholder only; may return
# NaN when the summed difference of squares is negative)
calcDist <- function(x1, x2) {
  return(sqrt(sum(x1^2 - x2^2)))
}

# distance between two productIds within the same data frame
my_distance <- function(df, id1, id2) {
  x1 <- df[df$productId == id1, 'sales']
  x2 <- df[df$productId == id2, 'sales']
  distx <- calcDist(x1, x2)
  return(distx)
}

# example data: three productIds with four sales observations each
productId <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)
sales <- runif(length(productId), min = 0, max = 100)
df <- data.frame(productId, sales)
...mcmapply()
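The mcmapply call is elided in the original question; for context, here is a minimal sketch of what the described loop might look like, assuming the data frames live in a list dfList and that productId pairs are enumerated with combn (both assumptions, not part of the question):

# hypothetical sketch of the current approach: for each data frame,
# compute the distance for every pair of productIds with mcmapply
results <- list()
for (i in seq_along(dfList)) {
  d <- dfList[[i]]
  pairs <- combn(unique(d$productId), 2)   # all productId pairs, 2 x k matrix
  results[[i]] <- mcmapply(function(a, b) my_distance(d, a, b),
                           pairs[1, ], pairs[2, ])
}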
Answer 0 (score: 0)
Here is a working solution. Note that your function sometimes returns NaN; I haven't looked into why, since your question seems to be about the overall process rather than the distance itself. I use localhost as the worker; you just need to replace hostNumbers with a character vector of the node names on your cluster, or initialize the cluster yourself, which it sounds like you already do. As long as you use that in place of devClust, the mapply will work.
# define supporting data structures
productId <- rep(c(1, 2, 3), each = 4)
sales.gen <- function() runif(length(productId), min = 0, max = 100)
df.gen <- function(x) data.frame(productId, sales = sales.gen())
dfList <- lapply(as.list(1:10), df.gen)
library(parallel)
# note: on.exit() only takes effect inside a function, so the cluster is
# shut down explicitly with stopCluster(devClust) at the end of the script
# here you should use a vector of your nodes' names
hostNumbers <- c("localhost")
# builds a list structure of host names, one per node
hostFrame <- lapply(hostNumbers, function(x) list(host = x))
# replicates each node `kCPUs` times, so number of cpus on each node is equal
# NOTE: total number of workers cannot exceed 128 in base R!
kCPUs <- 2
hostList <- rep(hostFrame, kCPUs)
# initialize the socket cluster; outfile = "" sends the individual workers'
# commands and logs to stdout, which serves as our R log file
devClust <- makePSOCKcluster(hostList, outfile = "")
# define functions (same as in the question)
calcDist <- function(x1, x2) {
  return(sqrt(sum(x1^2 - x2^2)))
}
my_distance <- function(df, id1, id2) {
  x1 <- df[df$productId == id1, 'sales']
  x2 <- df[df$productId == id2, 'sales']
  distx <- calcDist(x1, x2)
  return(distx)
}
# export function definitions to cluster
functionDefs <- Filter(function(x) is.function(get(x, .GlobalEnv)), ls(.GlobalEnv))
clusterExport(devClust, functionDefs)
# run distance calcs. Note that we quote the function because it has
# already been exported to the workers so there is no need to serialize again
list.of.distance.calcs <- clusterMap(devClust, 'my_distance', dfList, MoreArgs = list(id1 = 1, id2 = 2))
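The call above computes only the distance between productIds 1 and 2 in each data frame. To cover every pair within each data frame, one option is to move the pairing onto the workers with parLapply. This is a sketch under my own assumptions (the helper all_pair_distances and the use of combn are not from the original answer):

# hypothetical helper: all pairwise distances within one data frame
all_pair_distances <- function(df) {
  pairs <- combn(unique(df$productId), 2)   # all productId pairs
  mapply(function(a, b) my_distance(df, a, b), pairs[1, ], pairs[2, ])
}
clusterExport(devClust, "all_pair_distances")
all.distances <- parLapply(devClust, dfList, all_pair_distances)

# shut the cluster down once the results are in
stopCluster(devClust)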