Question

我有一个电子邮件地址列表，想用最长公共子字符串（LCS）距离来比较各行之间的模式（相似性）。

数据是一个包含 300,000 个电子邮件地址的数据框：

         V1
1   "01003@163.com"
2   "cloud@coldmail.com"
3   "den_smukk_kiilar@hotmail.com"
4   "Esteban.verduzco@gmail.com"
5   "freiheitmensch@gmail.com"
6   "mitsoanastos@yahoo.com"
7   "ahmedsir744@yahoo.com" 
8   ...

我正在使用此代码：

# Compare two rows of the global data frame `data` by LCS distance.
#
# j, i: row indices into `data`.
# Returns the rbind() of row j, row i and their distance as computed by
# stringdist(..., method = "lcs"), in that order.
compare_strings <- function(j, i) {
  # The local must stay named `value`: rbind() uses the symbol name as the
  # label of the third row.
  value <- as.numeric(
    stringdist(data[j, ], data[i, ], method = "lcs", nthread = 2)
  )
  rbind(data[j, ], data[i, ], value)
}

# Compare every row of `data` with all rows that follow it, so that each
# unordered pair of rows is evaluated exactly once.
n_rows <- nrow(data)  # loop-invariant, so computed once

# One slot per fixed row. Previously `out` was overwritten on every pass and
# only the comparison from the last iteration survived.
results <- vector("list", max(n_rows - 1L, 0L))

i <- 0
kk <- 1

while (kk < n_rows) {
  i <- i + 1            # fixed row
  j <- (kk + 1):n_rows  # rows to be compared with row i

  # apply() walks the (i, j) grid row by row; each compare_strings() call
  # yields three values, giving a 3 x length(j) matrix that is transposed to
  # one row per pair.
  pairs <- apply(
    expand.grid(i, j), 1,
    function(x) compare_strings(x[1], x[2])
  )
  results[[kk]] <- t(pairs)

  kk <- kk + 1
}

# Bind once at the end instead of growing a matrix inside the loop.
# NOTE(review): the number of pairs grows quadratically with nrow(data); for
# very large inputs the combined matrix may not fit in memory -- consider
# processing or filtering each results[[kk]] before binding.
out <- do.call(rbind, results)

运行得很好！但是我有 300,000 个电子邮件地址，所以我正在尝试将这个过程并行化：

library(parallel)

# Start a two-worker cluster.
clus <- makeCluster(2)

# Define the comparison function on every worker.
# NOTE(review): `data` is never sent to the workers (there is no
# clusterExport() call), so on each node the name `data` presumably resolves
# to the utils::data function rather than the email data frame. That matches
# the "object of type 'closure' is not subsettable" error reported for this
# snippet. It is left unchanged because it is the subject of the question.
clusterEvalQ(clus, compare.strings <- function(j, i) {
  library(stringdist)
  # The local stays named `value` so rbind() labels the third row with it.
  value <- as.numeric(
    stringdist(data[j, ], data[i, ], method = "lcs", nthread = 2)
  )
  rbind(data[j, ], data[i, ], value)
})

# Evaluate compare.strings() for every row of the (i, j) grid on the cluster.
# Fixed: the original called the non-existent `as.matriz`, which fails with
# "could not find function" before any worker is contacted.
out <- as.matrix(t(parRapply(
  clus, expand.grid(i, j),
  function(x) compare.strings(x[1], x[2])
)))

但我得到了以下错误：

Error in checkForRemoteErrors(val) : 
  2 nodes produced errors; first error: object of type 'closure' is not subsettable

我做错了什么？有没有更好的方法来比较这么大量的字符串？

Answer 1

我们需要使用“clusterExport”导出数据。

# Create the cluster
clus <- makeCluster(8)

# Copy the email data frame and the stringdist function to every worker.
# clusterExport() documents its variable list as a character vector of names.
# Assumes stringdist is attached in the calling session so that the name
# "stringdist" can be found -- TODO confirm.
clusterExport(clus, c("data", "stringdist"), envir = environment())

# Define the comparison function on every worker.
clusterEvalQ(
  clus,
  compare_strings <- function(j, i) {
    # Each call compares a single pair of strings and the workers already run
    # side by side, so one thread per call avoids asking for 8 x 8 threads.
    # The local stays named `value` so rbind() labels the third row with it.
    value <- as.numeric(
      stringdist(data[j, ], data[i, ], method = "lcs", nthread = 1)
    )
    rbind(data[j, ], data[i, ], value)
  }
)

定义好 i 和 j 的范围之后，我们就可以运行：

# Apply the declared function to every row of the (i, j) grid on the cluster.
# unlist() flattens the per-pair results into one vector, which is then
# folded back into a matrix with one row of three columns per pair.
pair_values <- unlist(
  parRapply(clus, expand.grid(i, j), function(x) compare_strings(x[1], x[2]))
)
out <- matrix(pair_values, ncol = 3, byrow = TRUE)

# Once all comparisons are finished, release the workers with
# stopCluster(clus); it is not called here so the cluster stays usable for
# further (i, j) ranges.

parRapply 出了什么问题？

1 个答案: