Question

我正尝试并行地使用openNLP / NLP包中的词性标注功能。我需要代码能在任何操作系统上运行，所以选择使用parallel包中的parLapply函数（但也乐于接受其他独立于操作系统的方案）。过去，我在parLapply中运行openNLP包的tagPOS函数，没有任何问题。但是，openNLP包最近做了一些更改，移除了tagPOS，并添加了一些更灵活的选项。Kurt非常友好地帮助我用新版软件包的工具重新创建了tagPOS函数。我可以让lapply版本正常工作，但并行版本不行。它不断提示需要向节点传递更多变量，直到最后要求传递openNLP中的一个未导出函数。它不断要求传递越来越多的变量，这很奇怪，也说明我对parLapply的设置有误。如何设置tagPOS，使其以独立于操作系统的方式并行运行？

library(openNLP)
library(NLP)
library(parallel)

## POS tagger
## Tag the parts of speech in `x` with the supplied annotator.
##
## Arguments:
##   x                 Text to tag; coerced with as.String().
##   pos_tag_annotator POS annotator passed to annotate().
##   ...               Accepted but not used by the body.
##
## Returns a list with `POStagged` (the "token/TAG" pairs joined by
## single spaces) and `POStags` (the tags on their own).
tagPOS <- function(x, pos_tag_annotator, ...) {
    txt <- as.String(x)

    ## The whole input is treated as a single sentence; word tokens and
    ## then POS tags are layered on top of that seed annotation.
    tokenizer <- Maxent_Word_Token_Annotator()
    sentence_span <- Annotation(1L, "sentence", 1L, nchar(txt))
    with_words <- annotate(txt, tokenizer, sentence_span)
    with_pos <- annotate(txt, pos_tag_annotator, with_words)

    ## Keep the word-level annotations only and read each one's POS tag.
    is_word <- with_pos$type == "word"
    words <- with_pos[is_word]
    POStags <- unlist(lapply(words$features, function(feat) feat[["POS"]]))

    ## Pair every token with its tag.
    tagged_tokens <- sprintf("%s/%s", txt[words], POStags)
    list(
        POStagged = paste(tagged_tokens, collapse = " "),
        POStags = POStags
    )
}

## Set up a parallel run
## Sample sentences to tag; `ntv` is how many there are.
text.var <- c("I like it.", "This is outstanding soup!",  
    "I really must get the recipe.")
ntv <- length(text.var)
## The POS annotator is created once here, before the cluster exists.
PTA <- Maxent_POS_Tag_Annotator()   

## Cluster size comes from the "cl.cores" option, falling back to half
## of the detected cores.
cl <- makeCluster(mc <- getOption("cl.cores", detectCores()/2))
## Export the data, the tagger, the annotator object and two package
## functions to the workers by name.
## NOTE(review): openNLP/NLP themselves are never loaded on the workers
## in this script -- presumably why the run below ends in
## "could not find function"; confirm.
clusterExport(cl=cl, varlist=c("text.var", "ntv", 
    "tagPOS", "PTA", "as.String", "Maxent_Word_Token_Annotator"), 
    envir = environment())
## Tag each sentence on the cluster, indexing text.var by position.
m <- parLapply(cl, seq_len(ntv), function(i) {
        x <- tagPOS(text.var[i], PTA)
        return(x)
    }
)
stopCluster(cl)

## Error in checkForRemoteErrors(val) : 
##   3 nodes produced errors; first error: could not find function 
##   "Maxent_Simple_Word_Tokenizer"

openNLP::Maxent_Simple_Word_Tokenizer

## >openNLP::Maxent_Simple_Word_Tokenizer
## Error: 'Maxent_Simple_Word_Tokenizer' is not an exported 
##     object from 'namespace:openNLP'

## It's a non exported function
openNLP:::Maxent_Simple_Word_Tokenizer


## Demo that it works with lapply
## Serial run, indexing text.var by position (same shape as the
## parLapply call).
lapply(seq_len(ntv), function(i) {
    tagPOS(text.var[i], PTA)
})

## Serial run again, this time iterating over the sentences directly.
lapply(text.var, function(x) {
    tagPOS(x, PTA)
})

## >     lapply(seq_len(ntv), function(i) {
## +         tagPOS(text.var[i], PTA)
## +     })
## [[1]]
## [[1]]$POStagged
## [1] "I/PRP like/IN it/PRP ./."
## 
## [[1]]$POStags
## [1] "PRP" "IN"  "PRP" "."  
## 
## [[1]]$word.count
## [1] 3
## 
## 
## [[2]]
## [[2]]$POStagged
## [1] "THis/DT is/VBZ outstanding/JJ soup/NN !/."
## 
## [[2]]$POStags
## [1] "DT"  "VBZ" "JJ"  "NN"  "."  
## 
## [[2]]$word.count
## [1] 4
## 
## 
## [[3]]
## [[3]]$POStagged
## [1] "I/PRP really/RB must/MD get/VB the/DT recip/NN ./."
## 
## [[3]]$POStags
## [1] "PRP" "RB"  "MD"  "VB"  "DT"  "NN"  "."  
## 
## [[3]]$word.count
## [1] 6

编辑：根据史蒂夫的建议

请注意，这个版本的openNLP是全新的。我是用CRAN上的tar.gz安装的0.2-1版（见下方的packageDescription输出）。即使该函数确实存在，我仍然收到以下错误。

library(openNLP); library(NLP); library(parallel)

## Tag the parts of speech in `text.var`.
##
## Arguments:
##   text.var          Text to tag; coerced with as.String().
##   pos_tag_annotator POS annotator passed to annotate(). When omitted,
##                     one is created with Maxent_POS_Tag_Annotator(),
##                     so each call can build its own (for parallel use).
##   ...               Accepted but not used by the body.
##
## Returns a list with `POStagged` (the "token/TAG" pairs joined by
## single spaces) and `POStags` (the tags on their own).
tagPOS <-  function(text.var, pos_tag_annotator, ...) {
    s <- as.String(text.var)

    ## Set up the POS annotator if missing (for parallel).
    ## The default is stored in the argument itself so that a supplied
    ## annotator is actually used instead of an unrelated `PTA` variable.
    if (missing(pos_tag_annotator)) {
        pos_tag_annotator <- Maxent_POS_Tag_Annotator()
    }

    ## Need sentence and word token annotations.
    word_token_annotator <- Maxent_Word_Token_Annotator()
    a2 <- Annotation(1L, "sentence", 1L, nchar(s))
    a2 <- annotate(s, word_token_annotator, a2)
    a3 <- annotate(s, pos_tag_annotator, a2)

    ## Determine the distribution of POS tags for word tokens.
    a3w <- a3[a3$type == "word"]
    POStags <- unlist(lapply(a3w$features, "[[", "POS"))

    ## Extract token/POS pairs (all of them): easy.
    POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
    list(POStagged = POStagged, POStags = POStags)
}

## Sample sentences to tag.
text.var <- c("I like it.", "This is outstanding soup!",  
    "I really must get the recipe.")

## Cluster size comes from the "cl.cores" option, falling back to half
## of the detected cores.
cl <- makeCluster(mc <- getOption("cl.cores", detectCores()/2))
## Load both packages on every worker before tagging.
clusterEvalQ(cl, {library(openNLP); library(NLP)})
## tagPOS receives only the sentence here, so its missing-annotator
## branch is the one that runs on each worker.
m <- parLapply(cl, text.var, tagPOS)

## > m <- parLapply(cl, text.var, tagPOS)
## Error in checkForRemoteErrors(val) : 
##   3 nodes produced errors; first error: could not find function "Maxent_POS_Tag_Annotator"

stopCluster(cl)


> packageDescription('openNLP')
Package: openNLP
Encoding: UTF-8
Version: 0.2-1
Title: Apache OpenNLP Tools Interface
Authors@R: person("Kurt", "Hornik", role = c("aut", "cre"), email =
          "Kurt.Hornik@R-project.org")
Description: An interface to the Apache OpenNLP tools (version 1.5.3).  The Apache OpenNLP
          library is a machine learning based toolkit for the processing of natural language
          text written in Java.  It supports the most common NLP tasks, such as tokenization,
          sentence segmentation, part-of-speech tagging, named entity extraction, chunking,
          parsing, and coreference resolution.  See http://opennlp.apache.org/ for more
          information.
Imports: NLP (>= 0.1-0), openNLPdata (>= 1.5.3-1), rJava (>= 0.6-3)
SystemRequirements: Java (>= 5.0)
License: GPL-3
Packaged: 2013-08-20 13:23:54 UTC; hornik
Author: Kurt Hornik [aut, cre]
Maintainer: Kurt Hornik <Kurt.Hornik@R-project.org>
NeedsCompilation: no
Repository: CRAN
Date/Publication: 2013-08-20 15:41:22
Built: R 3.0.1; ; 2013-08-20 13:48:47 UTC; windows

Answer 1

由于您在群集工作者上调用NLP的函数，因此应在调用parLapply之前将其加载到每个worker上。您可以从worker函数执行此操作，但在创建集群对象后，我倾向于使用clusterCall或clusterEvalQ：

clusterEvalQ(cl, {library(openNLP); library(NLP)})

由于as.String和Maxent_Word_Token_Annotator位于这些包中，因此不应将其导出。

请注意，在我的计算机上运行您的示例时，我发现PTA对象在导出到worker之后无法正常工作。据推测，该对象中有某些内容无法被安全地序列化和反序列化。改用clusterEvalQ直接在各个worker上创建该对象后，示例即可成功运行。以下是使用openNLP 0.2-1的完整代码：

library(parallel)
## Tag the parts of speech in `x`.
##
## `PTA` is not an argument: it is read as a free variable, so a POS
## annotator with that name must already exist wherever this function
## is evaluated.
##
## Returns a list with `POStagged` (the "token/TAG" pairs joined by
## single spaces) and `POStags` (the tags on their own).
tagPOS <- function(x, ...) {
    doc <- as.String(x)

    ## Seed with one sentence spanning the whole input, then add word
    ## tokens and POS tags in turn.
    tokenizer <- Maxent_Word_Token_Annotator()
    seed <- Annotation(1L, "sentence", 1L, nchar(doc))
    tokenized <- annotate(doc, tokenizer, seed)
    tagged <- annotate(doc, PTA, tokenized)

    ## Only the word-level annotations carry the POS feature used here.
    word_rows <- tagged[tagged$type == "word"]
    POStags <- unlist(lapply(word_rows$features, function(f) f[["POS"]]))

    pairs <- sprintf("%s/%s", doc[word_rows], POStags)
    list(POStagged = paste(pairs, collapse = " "), POStags = POStags)
}
## Sample sentences to tag.
text.var <- c("I like it.", "This is outstanding soup!",
    "I really must get the recipe.")

## Use half of the detected cores, but never fewer than one worker:
## detectCores() can return 1 (half would be 0.5) or NA, and
## makeCluster() rejects both.
mc <- getOption("cl.cores", max(1L, detectCores() %/% 2L, na.rm = TRUE))
cl <- makeCluster(mc)

## The cluster is shut down even if worker setup or tagging fails.
m <- tryCatch({
    ## Load the packages and build the POS annotator on each worker;
    ## tagPOS() reads it there under the name `PTA`.
    clusterEvalQ(cl, {
        library(openNLP)
        library(NLP)
        PTA <- Maxent_POS_Tag_Annotator()
    })
    parLapply(cl, text.var, tagPOS)
}, finally = stopCluster(cl))
print(m)

如果clusterEvalQ因为找不到Maxent_POS_Tag_Annotator而失败，则可能是worker上加载了错误版本的openNLP。您可以用clusterEvalQ在worker上执行sessionInfo，以确定worker实际加载的软件包版本：

library(parallel)
## Two workers are enough to inspect what a worker session loads.
cl <- makeCluster(2)
clusterEvalQ(cl, {library(openNLP); library(NLP)})
## Gives back one sessionInfo() result per worker.
clusterEvalQ(cl, sessionInfo())
## Shut the workers down once the diagnostics have been printed.
stopCluster(cl)

这将返回在每个集群工作程序上执行sessionInfo()的结果。以下是我正在使用的一些软件包的版本信息，这对我有用：

other attached packages:
[1] NLP_0.1-0     openNLP_0.2-1

loaded via a namespace (and not attached):
[1] openNLPdata_1.5.3-1 rJava_0.9-4

并行parLapply设置

1 个答案: