使用Gate进行文本分割

时间:2015-05-11 20:50:14

标签: java gate

我正在尝试用 Java 编写自己的程序,把一组文本文件切分成句子。我调研了现有的 NLP 工具并找到了 GATE,但我不知道如何只使用它的处理管道(pipeline)来完成句子切分。

  1. 有什么办法可以限制管道所执行的功能吗?
  2. 有没有可以帮助我编写这个程序的示例代码?

1 个答案:

答案 0 :(得分:2)

以下代码改编自另一个回答(different answer):

import gate.*;
import gate.creole.SerialAnalyserController;
import java.io.File;
import java.util.*;

/**
 * Minimal GATE Embedded example that splits a piece of text into sentences.
 *
 * <p>It builds a two-step pipeline (tokeniser followed by sentence splitter),
 * runs it over a single in-memory document and prints the text covered by
 * every {@code Sentence} annotation.
 */
public class Segmenter {
    /** GATE installation directory used when none is passed on the command line. */
    private static final String DEFAULT_GATE_HOME = "C:\\Program Files\\GATE_Developer_8.0";

    /**
     * Runs the segmentation example.
     *
     * @param args optional; if present, {@code args[0]} is used as the GATE home
     *             directory instead of the built-in default
     * @throws Exception if GATE cannot be initialised or the pipeline fails
     */
    public static void main(String[] args) throws Exception {
        String gateHome = (args != null && args.length > 0) ? args[0] : DEFAULT_GATE_HOME;
        Gate.setGateHome(new File(gateHome));
        Gate.init();
        registerGatePlugin("ANNIE");

        SerialAnalyserController pipeline = null;
        Corpus corpus = null;
        Document document = null;
        try {
            pipeline = (SerialAnalyserController) Factory.createResource("gate.creole.SerialAnalyserController");
            // The tokeniser is added before the sentence splitter, as in the original pipeline.
            pipeline.add((ProcessingResource) Factory.createResource("gate.creole.tokeniser.DefaultTokeniser"));
            pipeline.add((ProcessingResource) Factory.createResource("gate.creole.splitter.SentenceSplitter"));

            corpus = Factory.newCorpus("SegmenterCorpus");
            document = Factory.newDocument("Text to be segmented.");
            corpus.add(document);
            pipeline.setCorpus(corpus);
            pipeline.execute();

            AnnotationSet defaultAS = document.getAnnotations();
            AnnotationSet sentences = defaultAS.get("Sentence");

            // An AnnotationSet has no guaranteed iteration order; sort by offset so
            // sentences are printed in the order they appear in the text.
            for (Annotation sentence : Utils.inDocumentOrder(sentences)) {
                System.err.println(Utils.stringFor(document, sentence));
            }
        } finally {
            // Clean up even when resource creation or execution failed part-way.
            if (document != null) {
                Factory.deleteResource(document);
            }
            if (corpus != null) {
                Factory.deleteResource(corpus);
            }
            if (pipeline != null) {
                // Iterate over a snapshot: deleting a PR may cause the controller to
                // drop it from its own list (NOTE(review): confirm against the GATE
                // version in use), which would break iteration over the live collection.
                List<ProcessingResource> prs = new ArrayList<ProcessingResource>(pipeline.getPRs());
                for (ProcessingResource pr : prs) {
                    Factory.deleteResource(pr);
                }
                Factory.deleteResource(pipeline);
            }
        }
    }

    /**
     * Registers the plugin directory {@code name} located under the GATE plugins home.
     *
     * @param name plugin directory name, e.g. {@code "ANNIE"}
     * @throws Exception if the directory URL cannot be built or registration fails
     */
    public static void registerGatePlugin(String name) throws Exception {
        Gate.getCreoleRegister().registerDirectories(new File(Gate.getPluginsHome(), name).toURI().toURL());
    }

    /**
     * Misspelled original name, kept so existing callers continue to compile.
     *
     * @param name plugin directory name
     * @throws Exception if registration fails
     * @deprecated use {@link #registerGatePlugin(String)}
     */
    @Deprecated
    public static void regiterGatePlugin(String name) throws Exception {
        registerGatePlugin(name);
    }
}