我正在尝试使用Stanford CoreNLP来执行共指(Coref)解析。我使用的版本是stanford-corenlp-full-2015-12-09。基本上,我写了几个类:
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
public class CorefResolution {
public static String corefResolute(String text, List<String> tokenToReplace) {
Properties props = new Properties();
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
Annotation doc = new Annotation(text);
pipeline.annotate(doc);
Map<Integer, CorefChain> corefs = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
System.out.println(corefs);
List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
List<String> resolved = new ArrayList<String>();
for (CoreMap sentence : sentences) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
for (CoreLabel token : tokens) {
Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
token.get(Coref)
if (corefClustId == null) {
System.out.println("NULL NULL NULL\n");
resolved.add(token.word());
continue;
}
else {
System.out.println("Exist Exist Exist\n");
}
System.out.println("coreClustId is "+corefClustId.toString()+"\n");
CorefChain chain = corefs.get(corefClustId);
if (chain == null || chain.getMentionsInTextualOrder().size() == 1) {
resolved.add(token.word());
} else {
int sentINdx = chain.getRepresentativeMention().sentNum - 1;
CoreMap corefSentence = sentences.get(sentINdx);
List<CoreLabel> corefSentenceTokens = corefSentence.get(CoreAnnotations.TokensAnnotation.class);
CorefChain.CorefMention reprMent = chain.getRepresentativeMention();
if (tokenToReplace.contains(token.word())) {
for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
resolved.add(matchedLabel.word());
}
} else {
resolved.add(token.word());
}
}
}
}
Detokenizer detokenizer = new Detokenizer();
String resolvedStr = detokenizer.detokenize(resolved);
return resolvedStr;
}
}
另一个类:
import java.util.Arrays;
import java.util.List;
import java.util.LinkedList;
public class Detokenizer {

    /**
     * Joins a token list back into a sentence, inserting a single space between
     * tokens except around punctuation that attaches to a neighbouring token.
     * The supplied list is NOT modified (the original implementation inserted a
     * sentinel into the caller's list, which was a surprising side effect and
     * threw UnsupportedOperationException for fixed-size lists such as the
     * result of Arrays.asList).
     *
     * @param tokens tokens in order; may be immutable
     * @return the detokenized sentence
     */
    public String detokenize(List<String> tokens) {
        // Punctuation that attaches to the preceding token (no space before it)
        List<String> noSpaceBefore = new LinkedList<String>(Arrays.asList(",", ".", ";", ":", ")", "}", "]", "'", "'s", "n't"));
        // Tokens that the following token attaches to (no space after them)
        List<String> noSpaceAfter = new LinkedList<String>(Arrays.asList("(", "[", "{", "\"", ""));

        // Work on a local copy with a leading "" sentinel so the i-1 lookups
        // below are valid for the first real token ("" is in noSpaceAfter).
        List<String> work = new LinkedList<String>();
        work.add("");
        work.addAll(tokens);

        StringBuilder sentence = new StringBuilder();
        for (int i = 1; i < work.size(); i++) {
            if (noSpaceBefore.contains(work.get(i))
                    || noSpaceAfter.contains(work.get(i - 1))) {
                sentence.append(work.get(i));
            } else {
                sentence.append(" ").append(work.get(i));
            }
            // Assumption: opening double quotes are always followed by matching closing quotes.
            // This block switches the " between the two sets after each occurrence:
            // the opening quote takes no space after, the closing quote no space before.
            if ("\"".equals(work.get(i - 1))) {
                if (noSpaceAfter.contains("\"")) {
                    noSpaceAfter.remove("\"");
                    noSpaceBefore.add("\"");
                } else {
                    noSpaceAfter.add("\"");
                    noSpaceBefore.remove("\"");
                }
            }
        }
        return sentence.toString();
    }
}
另一个类文件
import java.io.*;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
public class PlainTextCorefResolver {

    /** Tokens whose occurrences should be replaced by a chain's representative mention. */
    private static final List<String> TOKENS_TO_REPLACE =
            Arrays.asList("He", "he", "She", "she", "It", "it", "They", "they");

    /**
     * Reads {@code inputFile} line by line (UTF-8), runs coreference resolution on
     * each line, and writes the resolved lines to {@code outputFile} (UTF-8).
     * Errors are reported to stderr; the method does not throw.
     */
    public static void resolveFile(File inputFile, File outputFile) {
        if (!inputFile.exists()) {
            System.err.println("Input file does not exist: " + inputFile.getAbsoluteFile());
            return;
        }
        // try-with-resources guarantees both streams are closed even if a line fails
        // to resolve (the original leaked them on any exception inside the loop).
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")));
             PrintWriter writer = new PrintWriter(outputFile, "UTF-8")) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.println(CorefResolution.corefResolute(line, TOKENS_TO_REPLACE));
            }
        } catch (Exception e) {
            System.err.println("Failed to open/resolve input file [" + inputFile.getAbsoluteFile() + "] in loader");
            e.printStackTrace();
        }
    }

    /**
     * Usage: {@code java PlainTextCorefResolver [input [output]]}.
     * Falls back to the historical hard-coded paths when arguments are omitted,
     * so existing invocations keep working.
     */
    public static void main(String[] args) {
        String inputFileName = args.length > 0 ? args[0] : "path/file.txt";
        String outputFileName = args.length > 1 ? args[1] : "path/file.resolved.txt";
        resolveFile(new File(inputFileName), new File(outputFileName));
    }
}
但是,它没有给出任何有用的结果。corefClusterId总是为null,因此我总是得到一堆"NULL NULL NULL"输出。
如何正确执行共指消解,用链中最具代表性的提及(人名或组织名等)来替换"他/她/它/它们/该体育场/……"这类指代词?
例如,给定: “Estadio El Madrigal是西班牙的一个体育场,从1923年开始使用。目前它主要用于足球比赛。” 我想得到 “Estadio El Madrigal是西班牙的一个体育场,自1923年开始使用.Estadio El Madrigal目前主要用于足球比赛。”
答案 0(得分:2)
我认为在你的例子中,我们的共指系统并没有把"Estadio El Madrigal"和"它"关联到一起。
以下是一些用于访问CorefChains和一般提及的示例代码。
import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
import edu.stanford.nlp.hcoref.data.CorefChain;
import edu.stanford.nlp.hcoref.data.Mention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.*;
public class CorefExample {

    /** Demonstrates how to access CorefChains and raw mentions with the newer "coref" annotator. */
    public static void main(String[] args) throws Exception {
        // Pipeline using the statistical coref annotator (requires the "mention" annotator before it).
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,mention,coref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document =
                new Annotation("John Kerry is the secretary of state. He ran for president in 2004.");
        pipeline.annotate(document);

        // Walk every coref chain and dump its mentions in textual order.
        System.out.println("---");
        System.out.println("coref chains");
        Map<Integer, CorefChain> chains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        for (CorefChain chain : chains.values()) {
            System.out.println("\t" + chain);
            System.out.println(chain.getMentionMap());
            for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
                System.out.println("---");
                System.out.println("full text: " + mention.mentionSpan);
                System.out.println("position: " + mention.position);
                System.out.println("start index of first word: " + mention.startIndex);
            }
        }

        // Dump the raw mentions detected in each sentence.
        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            System.out.println("---");
            System.out.println("mentions");
            for (Mention mention : sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)) {
                System.out.println("\t" + mention);
            }
        }
    }
}
======================
**更新**
@StanfordNLPHelper,使用"coref"而不是"dcoref"时出现错误:
INFO: Read 25 rules
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ...
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator mention
Using mention detector type: rule
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator coref
Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.Arrays.copyOfRange(Arrays.java:3664)
at java.lang.String.<init>(String.java:207)
at java.lang.StringBuilder.toString(StringBuilder.java:407)
at java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3079)
at java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:2874)
at java.io.ObjectInputStream.readString(ObjectInputStream.java:1639)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1342)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
at java.util.HashMap.readObject(HashMap.java:1394)
at sun.reflect.GeneratedMethodAccessor2.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1900)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2000)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1924)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
at edu.stanford.nlp.io.IOUtils.readObjectFromURLOrClasspathOrFileSystem(IOUtils.java:324)
at edu.stanford.nlp.scoref.SimpleLinearClassifier.<init>(SimpleLinearClassifier.java:30)
at edu.stanford.nlp.scoref.PairwiseModel.<init>(PairwiseModel.java:75)
at edu.stanford.nlp.scoref.PairwiseModel$Builder.build(PairwiseModel.java:57)
at edu.stanford.nlp.scoref.ClusteringCorefSystem.<init>(ClusteringCorefSystem.java:31)
at edu.stanford.nlp.scoref.StatisticalCorefSystem.fromProps(StatisticalCorefSystem.java:48)
at edu.stanford.nlp.pipeline.CorefAnnotator.<init>(CorefAnnotator.java:66)
at edu.stanford.nlp.pipeline.AnnotatorImplementations.coref(AnnotatorImplementations.java:220)
at edu.stanford.nlp.pipeline.AnnotatorFactories$13.create(AnnotatorFactories.java:515)
at edu.stanford.nlp.pipeline.AnnotatorPool.get(AnnotatorPool.java:85)
at edu.stanford.nlp.pipeline.StanfordCoreNLP.construct(StanfordCoreNLP.java:375)
Process finished with exit code 1