为中文标记化创建自定义插件

时间:2014-09-19 17:15:12

标签: plugins solr nlp stanford-nlp

我正在努力将Stanford分段器正确集成到Solr中以进行中文标记化。

此插件涉及加载其他jar文件和模型文件。我目前通过硬编码文件的完整路径,以一种粗暴的方式让它工作起来。

我正在寻找创建插件的方法,其中路径不需要硬编码,并且插件也符合SOLR插件架构。如果有任何推荐的网站或教程,请告诉我。

我在下面添加了我的代码:

公共类ChineseTokenizerFactory扩展了TokenizerFactory {

/** Creates a new WhitespaceTokenizerFactory */
public ChineseTokenizerFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}

@Override
public ChineseTokenizer create(AttributeFactory factory, Reader input) {
    Reader processedStringReader = new ProcessedStringReader(input);
    return new ChineseTokenizer(luceneMatchVersion, factory, processedStringReader);
}

}

公共类ProcessedStringReader扩展了java.io.Reader {

private static final int BUFFER_SIZE = 1024 * 8;
//private static TextProcess m_textProcess = null;
private static final String basedir = "/home/praveen/PDS_Meetup/solr-4.9.0/custom_plugins/";
static Properties props = null;
static CRFClassifier<CoreLabel> segmenter = null;
private char[] m_inputData = null;
private int m_offset = 0;
private int m_length = 0;

public ProcessedStringReader(Reader input){
    char[] arr = new char[BUFFER_SIZE];
    StringBuffer buf = new StringBuffer();
    int numChars;

    if(segmenter == null)
    {
        segmenter = new CRFClassifier<CoreLabel>(getProperties());
        segmenter.loadClassifierNoExceptions(basedir + "ctb.gz", getProperties());
    }

    try {
        while ((numChars = input.read(arr, 0, arr.length)) > 0) {
            buf.append(arr, 0, numChars);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    m_inputData = processText(buf.toString()).toCharArray();
    m_offset = 0;
    m_length = m_inputData.length;
}

@Override
public int read(char[] cbuf, int off, int len) throws IOException {
    int charNumber = 0;
    for(int i = m_offset + off;i<m_length && charNumber< len; i++){
        cbuf[charNumber] = m_inputData[i];
        m_offset ++;
        charNumber++;
    }
    if(charNumber == 0){
        return -1;
    }
    return charNumber;
}
@Override
public void close() throws IOException {
    m_inputData = null;
    m_offset = 0;
    m_length = 0;
}
public String processText(String inputText)
{
    List<String> segmented = segmenter.segmentString(inputText);
    String output = "";
    if(segmented.size() > 0)
    {
        output = segmented.get(0);
        for(int i=1;i<segmented.size();i++)
        {
            output = output + " " +segmented.get(i);
        }
    }
    System.out.println(output);
    return output;
}
static Properties getProperties()
{
    if (props == null) {
        props = new Properties();
        props.setProperty("sighanCorporaDict", basedir);
        // props.setProperty("NormalizationTable", "data/norm.simp.utf8");
        // props.setProperty("normTableEncoding", "UTF-8");
        // below is needed because CTBSegDocumentIteratorFactory accesses it
        props.setProperty("serDictionary",basedir+"dict-chris6.ser.gz");
        props.setProperty("inputEncoding", "UTF-8");
        props.setProperty("sighanPostProcessing", "true");
    }
    return props;
}

}

公共决赛班ChineseTokenizer扩展了CharTokenizer {

public ChineseTokenizer(Version matchVersion, Reader in) {
    super(matchVersion, in);
}
public ChineseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
    super(matchVersion, factory, in);
}

/** Collects only characters which do not satisfy
 * {@link Character#isWhitespace(int)}.*/
@Override
protected boolean isTokenChar(int c) {
    return !Character.isWhitespace(c);
}

}

1 个答案:

答案 0(得分:0):

您可以通过Factory的args参数传递参数。