使用Stanford CoreNLP(3.5.2)进行并发处理

时间:2015-06-05 21:44:01

标签: multithreading concurrency stanford-nlp

我在同时注释多个句子时面临并发问题。我不清楚我是做错了什么,或者CoreNLP中是否有错误。

我的目标是使用并行运行的多个线程使用管道“tokenize,ssplit,pos,lemma,ner,parse,dcoref”来注释句子。每个线程分配自己的StanfordCoreNLP实例,然后将其用于注释。

问题是在某些时候会引发异常:

java.util.ConcurrentModificationException
	at java.util.ArrayList$Itr.checkForComodification(ArrayList.java:901)
	at java.util.ArrayList$Itr.next(ArrayList.java:851)
	at java.util.Collections$UnmodifiableCollection$1.next(Collections.java:1042)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:463)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.<init>(GrammaticalStructure.java:201)
	at edu.stanford.nlp.trees.EnglishGrammaticalStructure.<init>(EnglishGrammaticalStructure.java:89)
	at edu.stanford.nlp.semgraph.SemanticGraphFactory.makeFromTree(SemanticGraphFactory.java:139)
	at edu.stanford.nlp.pipeline.DeterministicCorefAnnotator.annotate(DeterministicCorefAnnotator.java:89)
	at edu.stanford.nlp.pipeline.AnnotationPipeline.annotate(AnnotationPipeline.java:68)
	at edu.stanford.nlp.pipeline.StanfordCoreNLP.annotate(StanfordCoreNLP.java:412)

我正在附加一个应用程序的示例代码,该代码在我的Core i3 370M笔记本电脑(Win 7 64位,Java 1.8.0.45 64位)上大约20秒内重现问题。此应用程序读取识别文本蕴涵(RTE)语料库的XML文件,然后使用标准Java并发类同时解析所有句子。需要将本地RTE XML文件的路径作为命令行参数提供。在我的测试中,我在这里使用了公开的XML文件: http://www.nist.gov/tac/data/RTE/RTE3-DEV-FINAL.tar.gz

package semante.parser.stanford.server;

import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import javax.xml.bind.annotation.XmlAccessType;
import javax.xml.bind.annotation.XmlAccessorType;
import javax.xml.bind.annotation.XmlAttribute;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

/**
 * Stress test that annotates every sentence of an RTE corpus concurrently with
 * Stanford CoreNLP, creating one {@link StanfordCoreNLP} pipeline per task.
 *
 * <p>Usage: pass the path of an RTE XML file as the single command-line argument.
 * An instance is single-use: {@link #parseSentences()} shuts the internal executor down.
 */
public class StanfordMultiThreadingTest {

	/** JAXB binding for the root {@code <entailment-corpus>} element: a list of pairs. */
	@XmlRootElement(name = "entailment-corpus")
	@XmlAccessorType (XmlAccessType.FIELD)
	public static class Corpus {
		@XmlElement(name = "pair")
		private List<Pair> pairList = new ArrayList<Pair>();

		public void addPair(Pair p) {pairList.add(p);}
		public List<Pair> getPairList() {return pairList;}
	}

	/** JAXB binding for one {@code <pair>} element: attributes id/entailment, children t and h. */
	@XmlRootElement(name="pair")
	public static class Pair {

		@XmlAttribute(name = "id")
		String id;

		@XmlAttribute(name = "entailment")
		String entailment;

		@XmlElement(name = "t")
		String t;

		@XmlElement(name = "h")
		String h;

		// No-arg constructor, also used as the delegate of the public constructor.
		private Pair() {}

		public Pair(int id, boolean entailment, String t, String h) {
			this();
			this.id = Integer.toString(id);
			this.entailment = entailment ? "YES" : "NO";
			this.t = t;
			this.h = h;
		}

		public String getId() {return id;}
		public String getEntailment() {return entailment;}
		public String getT() {return t;}
		public String getH() {return h;}
	}

	/**
	 * Output stream that discards every byte written to it.
	 * Static so that instances do not hold a reference to the enclosing test object.
	 */
	static class NullStream extends OutputStream {
		@Override
		public void write(int b) {}
	}

	private Corpus corpus;
	private final Unmarshaller unmarshaller;
	private final ExecutorService executor;

	/**
	 * Creates the JAXB unmarshaller and a fixed thread pool with one thread per
	 * available processor.
	 *
	 * @throws Exception if the JAXB context or unmarshaller cannot be created
	 */
	public StanfordMultiThreadingTest() throws Exception {
		JAXBContext jaxbCtx = JAXBContext.newInstance(Pair.class,Corpus.class);
		unmarshaller = jaxbCtx.createUnmarshaller();
		executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
	}

	/**
	 * Reads the corpus from a UTF-8 encoded XML file and stores it for
	 * {@link #parseSentences()}.
	 *
	 * @param fileName path of the XML file to read
	 * @throws Exception if the file cannot be opened or unmarshalled
	 */
	public void readXML(String fileName) throws Exception {
		System.out.println("Reading XML - Started");
		// try-with-resources: the file handle is released even when unmarshalling fails
		try (InputStreamReader reader =
				new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8)) {
			corpus = (Corpus) unmarshaller.unmarshal(reader);
		}
		System.out.println("Reading XML - Ended");
	}

	/**
	 * Annotates the t and h sentences of every pair in parallel, printing progress
	 * every 20 completed sentences. The first task failure prints its stack trace
	 * and cancels the remaining work. Waits at most 10 minutes for completion.
	 *
	 * @throws IllegalStateException if no corpus has been loaded with {@link #readXML(String)}
	 * @throws Exception if the warm-up pipeline cannot be built or the wait is interrupted
	 */
	public void parseSentences() throws Exception {
		if (corpus == null) {
			throw new IllegalStateException("readXML must be called before parseSentences");
		}
		System.out.println("Parsing - Started");

		// turn pairs into a list of sentences
		List<String> sentences = new ArrayList<String>();
		for (Pair pair : corpus.getPairList()) {
			sentences.add(pair.getT());
			sentences.add(pair.getH());
		}

		// prepare the properties
		final Properties props = new Properties();
		props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");

		// first run is long since models are loaded
		new StanfordCoreNLP(props);

		// to avoid the CoreNLP initialization prints (e.g. "Adding annotation pos")
		final PrintStream originalErr = System.err;
		System.setErr(new PrintStream(new NullStream()));
		try {
			final int totalCount = sentences.size();
			final AtomicInteger counter = new AtomicInteger(0);

			// use java concurrency to parallelize the parsing
			for (final String sentence : sentences) {
				try {
					executor.execute(new Runnable() {
						@Override
						public void run() {
							try {
								StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
								pipeline.annotate(new Annotation(sentence));
								// use the value returned by the increment so the printed
								// percentage matches the count that triggered it
								int done = counter.incrementAndGet();
								if (done % 20 == 0) {
									System.out.println("Done: " + String.format("%.2f", done*100/(double)totalCount));
								}
							} catch (Exception e) {
								// write straight to the saved stream instead of swapping the
								// global System.err back and forth from several threads
								e.printStackTrace(originalErr);
								executor.shutdownNow();
							}
						}
					});
				} catch (java.util.concurrent.RejectedExecutionException rejected) {
					// a failed task already shut the executor down; stop submitting
					break;
				}
			}
			executor.shutdown();

			System.out.println("Waiting for parsing to end.");
			if (!executor.awaitTermination(10, TimeUnit.MINUTES)) {
				// do not report a timeout as a normal end, and do not leave workers running
				executor.shutdownNow();
				System.out.println("Parsing timed out after 10 minutes; remaining tasks were cancelled.");
			}
		} finally {
			// always give the real error stream back to the rest of the program
			System.setErr(originalErr);
		}

		System.out.println("Parsing - Ended");
	}

	/**
	 * Entry point.
	 *
	 * @param args {@code args[0]} is the path of the RTE XML file
	 * @throws Exception if reading or parsing fails
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 1) {
			System.err.println("Usage: StanfordMultiThreadingTest <path-to-RTE-xml>");
			System.exit(1);
		}
		StanfordMultiThreadingTest smtt = new StanfordMultiThreadingTest();
		smtt.readXML(args[0]);
		smtt.parseSentences();
	}

}

在我试图找到一些背景信息时,我遇到了来自斯坦福的Christopher Manning和Gabor Angeli给出的答案,这些答案表明当代版本的Stanford CoreNLP应该是线程安全的。但是,CoreNLP 3.4.1版上的最新bug report描述了并发问题。如标题中所述,我使用的是版本3.5.2。

我不清楚我面临的问题是由于错误还是因为我使用包的方式有问题。如果有更多知识渊博的人能够对此有所了解,我将不胜感激。我希望示例代码对于重现问题很有用。谢谢!

[1]:

2 个答案:

答案 0 :(得分:9)

您是否尝试过使用threads选项?您可以为单个StanfordCoreNLP管道指定多个线程,然后它将并行处理句子。

例如,如果您要在8个核心上处理句子,请将threads选项设置为:

8

尽管如此,我认为您的解决方案也应该有效,我们会检查是否存在并发错误,但在此期间使用此选项可能会解决您的问题。

答案 1 :(得分:0)

我有同样的问题,并使用最新的github修订版(今天)的构建解决了这个问题。所以我认为这是自3.5.2以来已经解决的CoreNLP问题。

另见CoreNLP on Apache Spark