在GATE中加载语言语料库

时间:2014-07-22 18:15:52

标签: nlp

我是 GATE 的新手。我想用文档创建一个语料库。我有大量的文档,因此每次都手动加载它们并创建语料库非常麻烦。有没有简单的方法可以直接创建语料库?

1 个答案:

答案 0 :(得分:0)

/**
 * Data store that holds the records saved for processing. Starts as
 * {@code null} and is assigned by {@code LoadSerialDataStore}.
 */
private static DataStore ProcessingDataStore = null;

/**
 * Corpus used for processing. Starts as {@code null} and is assigned by
 * {@code LoadSerialDataStore} from the data store above.
 */
private static Corpus ProcessingCorpus = null;


/**
 * Points {@code ProcessingDataStore} and {@code ProcessingCorpus} at the
 * serial data store located in the given directory.
 *
 * <p>If the directory already exists, the data store is opened and the
 * corpus is obtained through {@code CorpusUtil.loadSerialCorpus}. If it does
 * not exist, the directory is created, a new data store is created in it and
 * a new corpus with an empty name is adopted by and synced to that store.
 * When the directory cannot be created a warning is logged and the method
 * returns without touching either field.
 *
 * @param dataStoreDirPath path of the data store directory
 * @throws Exception propagated from the data store / factory calls
 */
private static void LoadSerialDataStore(String dataStoreDirPath)
        throws Exception {
    final File storeDir = new File(dataStoreDirPath);

    // Existing directory: open the store that is already there.
    if (storeDir.exists()) {
        final String existingStoreClass = SerialDataStore.class.getName();
        final String existingStoreUrl = storeDir.toURI().toString();
        ProcessingDataStore = (SerialDataStore) Factory.openDataStore(
                existingStoreClass, existingStoreUrl);
        ProcessingDataStore.open();
        ProcessingCorpus = CorpusUtil.loadSerialCorpus(ProcessingDataStore);
        return;
    }

    // Missing directory that cannot be created: warn and give up.
    if (!storeDir.mkdirs()) {
        logger.log(Level.WARNING,
                "Data store directory creation false!");
        return;
    }

    // Freshly created directory: build a new store with an empty corpus.
    // The URL is computed only now, after the directory exists.
    final String newStoreClass = SerialDataStore.class.getName();
    final String newStoreUrl = storeDir.toURI().toString();
    ProcessingDataStore = (SerialDataStore) Factory.createDataStore(
            newStoreClass, newStoreUrl);
    ProcessingDataStore.open();
    ProcessingCorpus = (Corpus) ProcessingDataStore.adopt(
            Factory.newCorpus(""), null);
    ProcessingDataStore.sync(ProcessingCorpus);
}

/**
 * Loads (or creates) the serial data store in the given directory, then adds
 * one new document built from {@code content} to {@code ProcessingCorpus},
 * unloads it from the corpus and deletes the temporary document resource.
 *
 * @param dataStoreDirPath path of the data store directory
 * @throws IllegalStateException if loading the data store or creating the
 *         document fails with a checked exception (kept as the cause), or if
 *         no corpus is available after loading
 */
private static void CreateSerialDataStore(String dataStoreDirPath){
    try {
        LoadSerialDataStore(dataStoreDirPath);

        // LoadSerialDataStore returns without assigning the corpus when the
        // directory cannot be created; fail clearly instead of with an NPE.
        if (ProcessingCorpus == null) {
            throw new IllegalStateException(
                    "No corpus available for data store directory: "
                            + dataStoreDirPath);
        }

        // NOTE(review): `content` is not declared in this method; it is
        // presumably a field defined elsewhere in the class - confirm.
        Document tempDocument = Factory.newDocument(content);
        try {
            FeatureMap featureMap = Factory.newFeatureMap();
            tempDocument.setFeatures(featureMap);
            ProcessingCorpus.add(tempDocument);
            ProcessingCorpus.unloadDocument(tempDocument);
        } finally {
            // Always release the temporary document, even if adding or
            // unloading it failed.
            Factory.deleteResource(tempDocument);
        }
    } catch (RuntimeException e) {
        throw e;
    } catch (Exception e) {
        // LoadSerialDataStore declares `throws Exception`; wrap it so this
        // method's signature stays unchanged while the cause is preserved.
        throw new IllegalStateException(
                "Failed to add document to data store at: "
                        + dataStoreDirPath, e);
    }
}