我是java新手,对我的要求是阅读文档并执行命名实体识别(NER)。对于简单的字符串,我做了以下:
// Load the trained person-name model. try-with-resources guarantees the
// stream is closed even if model loading throws (the original leaked the
// stream on failure because close() was not in a finally block).
TokenNameFinderModel model;
try (InputStream is = new FileInputStream("data/en-ner-person.bin")) {
    model = new TokenNameFinderModel(is);
}
NameFinderME nameFinder = new NameFinderME(model);
// One pre-tokenized sentence; NameFinderME.find expects the tokens of a single sentence.
String[] sentence = new String[]{"Smith",
    "Smithosian",
    "is",
    "a",
    "person"
};
// Each Span holds [start, end) token indices plus the entity type.
Span[] nameSpans = nameFinder.find(sentence);
但是,我需要实际从文档中读取流,然后生成XML。 任何人都可以告诉我该怎么做
谢谢。
答案 0(得分:4)
没有人回答这个问题所以我希望现在还不晚。
对于实体提取,您需要以String格式提供文档文本。可以在stackoverflow上查找将文档文本转换为String的多种方法(简短回答:对纯文本文件使用BufferedInputStream,对MS Office和PDF文件使用Apache Tika)
一旦你在内存中有了doc文本,这段代码应该为你提供Sentence边界检测,标记化和NER。然后获取结果并使用docname / docid生成xmlDoc,可能是一些文件元数据,实际的实体字符串,类型和Span(文本中NE命中的位置)
这门课应该让你入门
package processors;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
/**
 * Runs OpenNLP sentence detection, tokenization, and named-entity recognition
 * over a single document. Models are loaded once (static) and shared; one
 * instance is created per document, so processing can be launched on a
 * separate thread via {@link Runnable#run()}.
 */
public class OpenNLPNER implements Runnable
{

    // Shared, load-once models (loading is expensive).
    static TokenizerModel tm = null;
    static TokenNameFinderModel locModel = null;

    String doc;
    NameFinderME myNameFinder;
    TokenizerME wordBreaker;
    SentenceDetector sd;

    public OpenNLPNER()
    {
    }

    /**
     * @param document    raw document text to process
     * @param sd          sentence boundary detector
     * @param mf          name finder loaded with an NER model
     * @param wordBreaker tokenizer used to split each sentence into tokens
     */
    public OpenNLPNER(String document, SentenceDetector sd, NameFinderME mf, TokenizerME wordBreaker)
    {
        System.out.println("got doc");
        this.sd = sd;
        this.myNameFinder = mf;
        this.wordBreaker = wordBreaker;
        doc = document;
    }

    private static List<String> getMyDocsFromSomewhere()
    {
        //this should return an object that has all the info about the doc you want
        return new ArrayList<String>();
    }

    public static void main(String[] args)
    {
        try
        {
            String modelPath = "c:\\temp\\opennlpmodels\\";
            if (tm == null)
            {
                // Load the shared models once; try-with-resources closes the
                // streams (the original never closed any of them).
                try (InputStream tokenStream = new FileInputStream(new File(modelPath + "en-token.zip")))
                {
                    tm = new TokenizerModel(tokenStream);
                }
                try (InputStream nerStream = new FileInputStream(new File(modelPath + "en-ner-location.bin")))
                {
                    locModel = new TokenNameFinderModel(nerStream);
                }
            }

            System.out.println("getting data");
            List<String> docs = getMyDocsFromSomewhere();
            System.out.println("\tdone getting data");

            for (String docu : docs)
            {
                // You could also launch each document on its own thread,
                // since this class is Runnable.
                try (InputStream sentStream = new FileInputStream(new File(modelPath + "en-sent.zip")))
                {
                    new OpenNLPNER(docu,
                            new SentenceDetectorME(new SentenceModel(sentStream)),
                            new NameFinderME(locModel), new TokenizerME(tm)).run();
                }
            }
            System.out.println("done");
        } catch (Exception ex)
        {
            System.out.println(ex);
        }
    }

    @Override
    public void run()
    {
        try
        {
            process(doc);
        } catch (Exception ex)
        {
            System.out.println(ex);
        }
    }

    /**
     * Splits the document into sentences, tokenizes each sentence, and runs
     * the name finder over the tokens, printing every entity found. This is
     * the place to start emitting XML instead of println.
     *
     * @param document raw document text
     * @throws Exception propagated from the underlying OpenNLP components
     */
    public void process(String document) throws Exception
    {
        // Clear document-level adaptive data so entities learned from the
        // previous document do not influence this one.
        myNameFinder.clearAdaptiveData();

        // Sentence boundary detection.
        String[] sentences = sd.sentDetect(document);

        Span[][] allnamesInDoc = new Span[sentences.length][];
        String[][] allTokensInDoc = new String[sentences.length][];

        for (int sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++)
        {
            String[] stringTokens = wordBreaker.tokenize(sentences[sentenceIndex]);
            allnamesInDoc[sentenceIndex] = myNameFinder.find(stringTokens);
            allTokensInDoc[sentenceIndex] = stringTokens;
        }

        // Report the results; XML generation would start here.
        for (int s = 0; s < sentences.length; s++)
        {
            Span[] namesInSentence = allnamesInDoc[s];
            String[] tokensInSentence = allTokensInDoc[s];
            String[] entities = Span.spansToStrings(namesInSentence, tokensInSentence);

            for (int e = 0; e < entities.length; e++)
            {
                // BUG FIX: the original printed namesInSentence[s] — indexed by
                // the sentence counter instead of the entity index — which prints
                // the wrong span and throws ArrayIndexOutOfBoundsException when
                // s >= namesInSentence.length. Index by the entity instead.
                System.out.println(entities[e] + " Was in sentence " + s + " @ " + namesInSentence[e].toString());
            }
        }
    }
}