如何在OpenNLP中读取文档并进行命名实体识别

时间:2013-10-10 10:35:26

标签: java opennlp named-entity-recognition

我是Java新手,我的需求是读取一个文档并对其执行命名实体识别。对于简单的字符串,我已经做了如下尝试:

// Load the pre-trained person-name model. The stream is closed in a finally
// block so it is released even when the model file is corrupt or unreadable.
InputStream is = new FileInputStream("data/en-ner-person.bin");
TokenNameFinderModel model;
try
{
    model = new TokenNameFinderModel(is);
}
finally
{
    is.close();
}
NameFinderME nameFinder = new NameFinderME(model);

// The name finder expects one sentence that has already been split into tokens.
String[] sentence = new String[]{"Smith",
                "Smithosian",
                "is",
                "a",
                "person"
                };

// Each returned Span refers to positions in the token array above
// (token indices, not character offsets).
Span[] nameSpans = nameFinder.find(sentence);

但是,我实际需要的是从文档中读取输入流,然后生成XML。有人能告诉我该怎么做吗?

谢谢

1 个答案:

答案 0 :(得分:4)

一直没有人回答这个问题,希望我现在回答还不算太晚。

要进行实体提取,您需要先把文档文本读成String。可以在Stack Overflow上查找把文档文本读入String的多种方法(简而言之:纯文本文件使用BufferedInputStream,MS Office和PDF文件使用Apache Tika)。

一旦文档文本已经读入内存,下面这段代码就可以为您完成句子边界检测、分词(tokenization)和NER。然后根据结果生成XML文档,其中可以包含文档名/文档ID、可能还有一些文件元数据,以及实体字符串本身、实体类型和Span(命名实体在文本中命中的位置)。

下面这个类应该可以帮您入门:

package processors;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

/**
 * Small OpenNLP pipeline that runs sentence detection, tokenization and
 * named-entity recognition over one in-memory document and prints every
 * entity that was found.
 *
 * <p>Each instance wraps a single document together with the OpenNLP tools
 * used to process it. It implements {@link Runnable} so that a document can
 * be handed to a separate thread; the tool instances are supplied per
 * document by the caller rather than shared between instances.
 */
public class OpenNLPNER implements Runnable
{

    /** Tokenizer model; loaded once by {@link #main(String[])} and reused for every document. */
    static TokenizerModel tm = null;
    /** Location name-finder model; loaded once by {@link #main(String[])} and reused for every document. */
    static TokenNameFinderModel locModel = null;
    /** Full text of the document this instance processes. */
    String doc;
    NameFinderME myNameFinder;
    TokenizerME wordBreaker;
    SentenceDetector sd;

    /**
     * Creates an empty instance. All fields stay {@code null}, so the tools and
     * the document must be assigned before {@link #run()} or
     * {@link #process(String)} can do any useful work.
     */
    public OpenNLPNER()
    {
    }

    /**
     * Creates an instance bound to one document and the tools used to analyse it.
     *
     * @param document    full text of the document
     * @param sd          sentence detector used to split the document
     * @param mf          name finder used to locate entities in each sentence
     * @param wordBreaker tokenizer used to split each sentence into tokens
     */
    public OpenNLPNER(String document, SentenceDetector sd, NameFinderME mf, TokenizerME wordBreaker)
    {
        System.out.println("got doc");
        this.sd = sd;
        this.myNameFinder = mf;
        this.wordBreaker = wordBreaker;
        doc = document;
    }

    /**
     * Placeholder for the document source. Replace the body with code that
     * returns the text of every document to analyse.
     *
     * @return the document texts; this stub always returns an empty list
     */
    private static List<String> getMyDocsFromSomewhere()
    {
        //this should return an object that has all the info about the doc you want
        return new ArrayList<String>();
    }

    /** Reads a tokenizer model from {@code path}, always closing the file afterwards. */
    private static TokenizerModel loadTokenizerModel(String path) throws java.io.IOException
    {
        InputStream in = new FileInputStream(new File(path));
        try
        {
            return new TokenizerModel(in);
        } finally
        {
            in.close();
        }
    }

    /** Reads a name-finder model from {@code path}, always closing the file afterwards. */
    private static TokenNameFinderModel loadNameFinderModel(String path) throws java.io.IOException
    {
        InputStream in = new FileInputStream(new File(path));
        try
        {
            return new TokenNameFinderModel(in);
        } finally
        {
            in.close();
        }
    }

    /** Reads a sentence-detector model from {@code path}, always closing the file afterwards. */
    private static SentenceModel loadSentenceModel(String path) throws java.io.IOException
    {
        InputStream in = new FileInputStream(new File(path));
        try
        {
            return new SentenceModel(in);
        } finally
        {
            in.close();
        }
    }

    /**
     * Loads the models, fetches the documents and processes them one by one.
     * Any failure is printed to standard output instead of being rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        try
        {
            String modelPath = "c:\\temp\\opennlpmodels\\";

            if (tm == null)
            {
                //the models are read once and then shared by all tool instances
                tm = loadTokenizerModel(modelPath + "en-token.zip");
                locModel = loadNameFinderModel(modelPath + "en-ner-location.bin");
            }

            System.out.println("getting data");
            List<String> docs = getMyDocsFromSomewhere();
            System.out.println("\tdone getting data");

            if (!docs.isEmpty())
            {
                //read the sentence model once instead of once per document;
                //it is only needed when there is something to process
                SentenceModel sentenceModel = loadSentenceModel(modelPath + "en-sent.zip");

                for (String docu : docs)
                {
                    //you could also use the runnable here and launch in a diff thread;
                    //every document gets its own tool instances built from the shared models
                    new OpenNLPNER(docu,
                            new SentenceDetectorME(sentenceModel),
                            new NameFinderME(locModel), new TokenizerME(tm)).run();
                }
            }

            System.out.println("done");

        } catch (Exception ex)
        {
            System.out.println(ex);
        }
    }

    /**
     * Processes the document given at construction time. Exceptions are printed
     * to standard output instead of being propagated.
     */
    @Override
    public void run()
    {
        try
        {
            process(doc);
        } catch (Exception ex)
        {
            System.out.println(ex);
        }
    }

    /**
     * Splits the document into sentences, tokenizes each sentence, runs the
     * name finder on the tokens and prints every entity together with the index
     * of its sentence and its span.
     *
     * @param document full text to analyse
     * @throws Exception if any of the underlying tools fails
     */
    public void process(String document) throws Exception
    {
        //forget what the name finder learned from the previous document
        myNameFinder.clearAdaptiveData();

        //user splits doc to sentences (sd.sentPosDetect(document) gives the
        //matching sentence spans if positions of the sentences are needed)
        String[] sentences = sd.sentDetect(document);
        Span[][] allnamesInDoc = new Span[sentences.length][];
        String[][] allTokensInDoc = new String[sentences.length][];

        for (int sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++)
        {
            String[] stringTokens = wordBreaker.tokenize(sentences[sentenceIndex]);
            allnamesInDoc[sentenceIndex] = myNameFinder.find(stringTokens);
            allTokensInDoc[sentenceIndex] = stringTokens;
        }

        //now access the data like this...
        for (int s = 0; s < sentences.length; s++)
        {
            Span[] namesInSentence = allnamesInDoc[s];
            String[] tokensInSentence = allTokensInDoc[s];
            String[] entities = Span.spansToStrings(namesInSentence, tokensInSentence);
            //entities[e] is the text covered by namesInSentence[e], so the span
            //must be looked up by the entity index, not by the sentence index
            for (int e = 0; e < entities.length; e++)
            {
                //start building up the XML here....
                System.out.println(entities[e] + " Was in setnence " + s + " @ " + namesInSentence[e].toString());
            }
        }
    }
}