使用Stanford CoreNLP进行CorefResolution

时间:2016-03-24 16:10:20

标签: java nlp stanford-nlp

我正在尝试使用Stanford CoreNLP来执行共指(Coref)消解。我使用的版本是stanford-corenlp-full-2015-12-09。基本上,我写了几个类:

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;


public class CorefResolution {
    public static String corefResolute(String text, List<String> tokenToReplace) {
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation doc = new Annotation(text);
        pipeline.annotate(doc);

        Map<Integer, CorefChain> corefs = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        System.out.println(corefs);
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        List<String> resolved = new ArrayList<String>();

        for (CoreMap sentence : sentences) {
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);

            for (CoreLabel token : tokens) {

                Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
                token.get(Coref)

                if (corefClustId == null) {
                    System.out.println("NULL NULL NULL\n");
                    resolved.add(token.word());
                    continue;
                }
                else {
                    System.out.println("Exist Exist Exist\n");
                }

                System.out.println("coreClustId is "+corefClustId.toString()+"\n");
                CorefChain chain = corefs.get(corefClustId);

                if (chain == null || chain.getMentionsInTextualOrder().size() == 1) {
                    resolved.add(token.word());
                } else {
                    int sentINdx = chain.getRepresentativeMention().sentNum - 1;
                    CoreMap corefSentence = sentences.get(sentINdx);
                    List<CoreLabel> corefSentenceTokens = corefSentence.get(CoreAnnotations.TokensAnnotation.class);

                    CorefChain.CorefMention reprMent = chain.getRepresentativeMention();

                    if (tokenToReplace.contains(token.word())) {
                        for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
                            CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
                            resolved.add(matchedLabel.word());
                        }
                    } else {
                        resolved.add(token.word());
                    }
                }
            }
        }

        Detokenizer detokenizer = new Detokenizer();
        String resolvedStr = detokenizer.detokenize(resolved);

        return resolvedStr;
    }
}

另一个类

import java.util.Arrays;
import java.util.List;
import java.util.LinkedList;


public class Detokenizer {

    /**
     * Joins tokens back into a sentence, omitting the space before/after the
     * usual punctuation characters. Double quotes alternate: the first of a pair
     * attaches to the following token, the second to the preceding one.
     * The input list is never modified.
     *
     * @param tokens tokens in textual order
     * @return the detokenized sentence, with no leading space
     */
    public String detokenize(List<String> tokens) {
        //Define list of punctuation characters that should NOT have spaces before or after
        List<String> noSpaceBefore = new LinkedList<String>(Arrays.asList(",", ".", ";", ":", ")", "}", "]", "'", "'s", "n't"));
        List<String> noSpaceAfter = new LinkedList<String>(Arrays.asList("(", "[", "{", "\"", ""));

        StringBuilder sentence = new StringBuilder();

        // Work on a local copy: the original code mutated the caller's list,
        // which also threw UnsupportedOperationException on fixed-size lists
        // such as Arrays.asList(...). The "" sentinel at position 0 lets the
        // loop always look at i-1; "" is in noSpaceAfter so the first real
        // token is appended without a leading space.
        List<String> toks = new LinkedList<String>(tokens);
        toks.add(0, "");
        for (int i = 1; i < toks.size(); i++) {
            if (noSpaceBefore.contains(toks.get(i))
                    || noSpaceAfter.contains(toks.get(i - 1))) {
                sentence.append(toks.get(i));
            } else {
                sentence.append(" " + toks.get(i));
            }

            // Assumption that opening double quotes are always followed by matching closing double quotes
            // This block switches the " to the other set after each occurrence
            // ie The first double quotes should have no space after, then the 2nd double quotes should have no space before
            if ("\"".equals(toks.get(i - 1))) {
                if (noSpaceAfter.contains("\"")) {
                    noSpaceAfter.remove("\"");
                    noSpaceBefore.add("\"");
                } else {
                    noSpaceAfter.add("\"");
                    noSpaceBefore.remove("\"");
                }
            }
        }
        return sentence.toString();
    }
}

另一个类文件

import java.io.*;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;


public class PlainTextCorefResolver {

    /**
     * Reads {@code inputFile} line by line, runs coreference resolution on each
     * line, and writes the resolved lines to {@code outputFile}. Both files are
     * treated as UTF-8. Failures are logged to stderr and swallowed so a bad
     * file does not abort the caller.
     *
     * @param inputFile  the UTF-8 text file to resolve
     * @param outputFile the UTF-8 file the resolved text is written to
     */
    public static void resolveFile(File inputFile, File outputFile) {
        // Pronouns (plus capitalized forms) eligible for replacement; built once
        // instead of once per line.
        List<String> tokenToReplace = Arrays.asList("He", "he", "She", "she", "It", "it", "They", "they");

        // try-with-resources guarantees both streams are closed even when
        // resolution throws mid-file (the original leaked them on exception).
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")));
             PrintWriter writer = new PrintWriter(outputFile, "UTF-8")) {

            String line;
            while ((line = reader.readLine()) != null) {
                writer.println(CorefResolution.corefResolute(line, tokenToReplace));
            }

        } catch (Exception e) {
            System.err.println("Failed to open/resolve input file [" + inputFile.getAbsoluteFile() + "] in loader");
            e.printStackTrace();
        }
    }

    /** Example driver — adjust the paths before running. */
    public static void main(String[] args) {
        File inputFile = new File("path/file.txt");
        File outputFile = new File("path/file.resolved.txt");
        resolveFile(inputFile, outputFile);
    }

}

但是,它没有给出任何有用的结果。 corefClusterId总是为null,因此我总是得到一堆“NULL NULL NULL”输出。

如何正确执行共指消解,用每条共指链中最具代表性的提及(人名或组织名)来替换诸如"他/她/它/该体育场"之类的代词?

例如,给定: "Estadio El Madrigal是西班牙的一个体育场,从1923年开始使用。目前它主要用于足球比赛。" 我想得到 "Estadio El Madrigal是西班牙的一个体育场,自1923年开始使用。Estadio El Madrigal目前主要用于足球比赛。"

1 个答案:

答案 0 :(得分:2)

我不认为我们的共指系统在你的例子中把 "Estadio El Madrigal" 和 "它" 关联了起来。

以下是一些用于访问CorefChains和一般提及的示例代码。

import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
import edu.stanford.nlp.hcoref.data.CorefChain;
import edu.stanford.nlp.hcoref.data.Mention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;

public class CorefExample {

    /**
     * Demonstrates the hcoref-based pipeline: prints every coref chain with its
     * mentions, then the raw per-sentence mentions found by the mention annotator.
     */
    public static void main(String[] args) throws Exception {

        // Build the pipeline first; note "mention" and "coref" replace "dcoref".
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,mention,coref");
        StanfordCoreNLP nlp = new StanfordCoreNLP(props);

        Annotation document = new Annotation("John Kerry is the secretary of state.  He ran for president in 2004.");
        nlp.annotate(document);

        System.out.println("---");
        System.out.println("coref chains");

        // Each chain groups the mentions that refer to the same entity.
        Map<Integer, CorefChain> chains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        for (CorefChain chain : chains.values()) {
            System.out.println("\t" + chain);
            System.out.println(chain.getMentionMap());
            for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
                System.out.println("---");
                System.out.println("full text: " + mention.mentionSpan);
                System.out.println("position: " + mention.position);
                System.out.println("start index of first word: " + mention.startIndex);
            }
        }

        // Mentions as detected per sentence, before/independent of chain grouping.
        for (CoreMap sent : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            System.out.println("---");
            System.out.println("mentions");
            for (Mention mention : sent.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)) {
                System.out.println("\t" + mention);
            }
        }
    }
}

======================
更新
@StanfordNLPHelper,使用 "coref" 而不是 "dcoref" 时出现错误:

INFO: Read 25 rules
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator mention
Using mention detector type: rule
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator coref
Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded
    at java.util.Arrays.copyOfRange(Arrays.java:3664)
    at java.lang.String.<init>(String.java:207)
    at java.lang.StringBuilder.toString(StringBuilder.java:407)
    at java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3079)
    at java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:2874)
    at java.io.ObjectInputStream.readString(ObjectInputStream.java:1639)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1342)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
    at java.util.HashMap.readObject(HashMap.java:1394)
    at sun.reflect.GeneratedMethodAccessor2.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1900)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2000)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1924)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
    at edu.stanford.nlp.io.IOUtils.readObjectFromURLOrClasspathOrFileSystem(IOUtils.java:324)
    at edu.stanford.nlp.scoref.SimpleLinearClassifier.<init>(SimpleLinearClassifier.java:30)
    at edu.stanford.nlp.scoref.PairwiseModel.<init>(PairwiseModel.java:75)
    at edu.stanford.nlp.scoref.PairwiseModel$Builder.build(PairwiseModel.java:57)
    at edu.stanford.nlp.scoref.ClusteringCorefSystem.<init>(ClusteringCorefSystem.java:31)
    at edu.stanford.nlp.scoref.StatisticalCorefSystem.fromProps(StatisticalCorefSystem.java:48)
    at edu.stanford.nlp.pipeline.CorefAnnotator.<init>(CorefAnnotator.java:66)
    at edu.stanford.nlp.pipeline.AnnotatorImplementations.coref(AnnotatorImplementations.java:220)
    at edu.stanford.nlp.pipeline.AnnotatorFactories$13.create(AnnotatorFactories.java:515)
    at edu.stanford.nlp.pipeline.AnnotatorPool.get(AnnotatorPool.java:85)
    at edu.stanford.nlp.pipeline.StanfordCoreNLP.construct(StanfordCoreNLP.java:375)

Process finished with exit code 1