require(plyr)
library(reshape)
library(iterators)
library(parallel)
library(foreach)
library(doParallel)
# Cosine similarity between two equally shaped numeric inputs:
# the dot product of `x` and `y` divided by the product of their
# Euclidean norms. Yields NaN when either input is all zeros (0 / 0).
getCosine <- function(x, y) {
  dot_product <- sum(x * y)
  norm_x <- sqrt(sum(x * x))
  norm_y <- sqrt(sum(y * y))
  dot_product / (norm_x * norm_y)
}
# Load the raw visitor/product view counts.
visitordata <- read.csv(
  file = "~/Hotels.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Keep only rows with at least one product view; which() drops rows where
# the comparison is NA, exactly as subset() does.
has_views <- with(visitordata, Product.Views > 0)
visitordata <- visitordata[which(has_views), , drop = FALSE]
head(visitordata)
Visitor_ID Products Product.Views
2 1001863689_3519696751 CZ1XQZ 2
3 1001863689_3519696751 CZR3CN 1
4 1001863689_3519696751 CZTNKN 3
5 121021834007_98749174 CZ2LB0 1
6 11029477426_678878300 CZTNKN 1
7 21029477426_678878300 CZVDHR 1
# Pivot the long (visitor, product, views) table into one row per visitor
# with one `Product.Views.<product>` column per product.
ColumnBasedData <- reshape(
  visitordata,
  idvar = "Visitor_ID",
  timevar = "Products",
  direction = "wide"
)
# Visitor/product combinations that never occurred come out as NA;
# treat them as zero views.
ColumnBasedData[is.na(ColumnBasedData)] <- 0
# Keep only the product columns. drop = FALSE keeps `x` a data frame even
# when a single product column remains; without it `x` would collapse to a
# bare vector and ncol(x) would be NULL.
x <- ColumnBasedData[, !(names(ColumnBasedData) %in% "Visitor_ID"), drop = FALSE]
head(x)
Product.Views.CZ1XQZ Product.Views.CZR3CN Product.Views.CZTNKN Product.Views.CZVDHR Product.Views.CZ36D3 Product.Views.CZE0EN
2 1 1 1 0 0 0
6 0 0 1 1 0 0
9 0 0 0 0 1 1
24 0 0 0 0 0 0
37 0 0 0 0 0 0
40 0 0 0 0 0 0
# Compute the cosine similarity between every pair of columns of `x` in
# parallel and write the resulting square matrix to ~/cosine.csv.
#
# Each %dopar% task runs in a separate worker process, so an assignment to
# `dataframe_y` made inside the loop body only changes that worker's private
# copy and is lost. Every task therefore *returns* one row of similarities,
# and the rows are stacked on the master via `.combine = rbind`.
n_products <- ncol(x)

# Leave one core free, but never ask for fewer than one worker
# (detectCores() may return 1 or NA).
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
doParallel::registerDoParallel(cl)

ls <- tryCatch(
  foreach(i = seq_len(n_products), .combine = rbind) %dopar% {
    similarity_row <- numeric(n_products)
    for (j in seq_len(n_products)) {
      # [[ ]] hands getCosine plain column vectors instead of
      # one-column data frames.
      similarity_row[j] <- getCosine(x[[i]], x[[j]])
    }
    similarity_row
  },
  # Shut the workers down even if a task fails.
  finally = stopCluster(cl)
)

# Re-shape the combined rows into a named square matrix; as.numeric() also
# covers the one-column case, where foreach returns a bare number.
holder <- matrix(
  as.numeric(ls),
  nrow = n_products,
  ncol = n_products,
  dimnames = list(colnames(x), colnames(x))
)
dataframe_y <- as.data.frame(holder)
write.csv(dataframe_y, file = "~/cosine.csv")
这段代码使用 `%do%` 时可以正常工作,但使用 `%dopar%` 时不行:使用 `%dopar%` 时,`dataframe_y` 返回 null。有什么想法吗?
编辑:补充了所加载的库、函数定义和数据示例。我要处理的数据量很大,所以想使用并行处理;如果不使用并行处理,脚本需要一天以上才能运行完。
答案 0 :(得分:0)
非常感谢大家。嵌套的 `foreach` 解决了我的问题,修改后的代码如下。
# Nested foreach: the inner loop produces one similarity per column pair and
# cbind()s them into a row; the outer loop rbind()s the rows into the full
# square matrix. Each task simply returns its value -- assigning into
# `dataframe_y` on a worker has no effect on the master and only forces the
# whole data frame to be shipped to every worker.
# %dopar% needs a registered parallel backend; without one foreach falls
# back to sequential execution with a warning.
ls <-
  foreach(i = seq_len(ncol(x)), .combine = rbind) %:%
  foreach(j = seq_len(ncol(x)), .combine = cbind) %dopar% {
    getCosine(x[i], x[j])
  }
# Attach the product names to both dimensions of the result.
holder <- matrix(
  ls,
  nrow = ncol(x),
  ncol = ncol(x),
  dimnames = list(colnames(x), colnames(x))
)
dataframe_y <- as.data.frame(holder)