Solr 自定义分词器(Tokenizer)仅在第一次查询时被调用

时间:2012-04-17 04:31:21

标签: java plugins solr lucene tokenize

我创建了一个自定义分词器(Tokenizer),通过 admin/analysis.jsp 页面和 System.out 日志检查,它似乎工作正常。但是,当我对使用此自定义分词器的字段执行查询时,我发现 Solr 只在处理第一个查询字符串时调用了该自定义分词器(同样通过 System.out 日志确认)。你能帮我指出我哪里做错了吗?以下是我的代码:

package com.fosp.searchengine;
import java.io.Reader;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.analysis.WhitespaceTokenizerFactory;

/**
 * Tokenizer factory that feeds the incoming text through
 * {@link ProcessedStringReader} before handing it to a whitespace tokenizer.
 */
public class JvnTextProTokenizerFactory extends WhitespaceTokenizerFactory {

    /**
     * Creates a whitespace tokenizer over the pre-processed form of {@code input}.
     *
     * <p>A trace line is written to standard output on every call.
     *
     * <p>NOTE(review): only the reader passed to this method is wrapped. If the
     * framework re-points an already created tokenizer at a new reader without
     * calling {@code create} again, that later input would bypass
     * {@link ProcessedStringReader} -- confirm against the Solr/Lucene version in use.
     *
     * @param input raw character stream supplied by the analysis chain
     * @return a tokenizer reading from a {@link ProcessedStringReader} wrapped around {@code input}
     */
    @Override
    public WhitespaceTokenizer create(Reader input) {
        System.out.println("WhitespaceTokenizer create(Reader input)");
        return new WhitespaceTokenizer(new ProcessedStringReader(input));
    }
}


package com.fosp.searchengine;
import java.io.IOException;
import java.io.Reader;

/**
 * A {@link java.io.Reader} that eagerly drains another reader, passes the
 * complete text through {@code TextProcess.processText}, and then serves the
 * processed characters from an in-memory array.
 */
public class ProcessedStringReader extends java.io.Reader {

    /** Size of the chunk used while draining the wrapped reader. */
    private static final int BUFFER_SIZE = 1024 * 8;

    // Shared by all instances and created on first use.
    // NOTE(review): the lazy initialisation is not synchronized; confirm whether
    // TextProcess may be created/used from several threads at once.
    private static TextProcess m_textProcess = null;

    /** Processed text; set to {@code null} by {@link #close()}. */
    private char[] m_inputData = null;
    /** Index of the next character of {@code m_inputData} to hand out. */
    private int m_offset = 0;
    /** Number of valid characters in {@code m_inputData}. */
    private int m_length = 0;

    /**
     * Reads {@code input} to its end, runs the text through the shared
     * {@code TextProcess}, and buffers the result for subsequent reads.
     *
     * @param input source of the raw text; it is read but not closed here
     * @throws IllegalStateException if the shared {@code TextProcess} cannot be created
     */
    public ProcessedStringReader(Reader input){
        String rawText = readAll(input);
        m_inputData = sharedTextProcess().processText(rawText).toCharArray();
        m_offset = 0;
        m_length = m_inputData.length;
    }

    /** Drains {@code input} into a string, keeping whatever was read if an I/O error occurs. */
    private static String readAll(Reader input) {
        char[] chunk = new char[BUFFER_SIZE];
        StringBuilder text = new StringBuilder();
        try {
            int numChars;
            while ((numChars = input.read(chunk, 0, chunk.length)) > 0) {
                text.append(chunk, 0, numChars);
            }
        } catch (IOException e) {
            // Best effort, as before: report the failure and continue with the
            // text read so far (the constructor cannot throw a checked exception).
            e.printStackTrace();
        }
        return text.toString();
    }

    /** Returns the shared {@code TextProcess}, creating it on first use. */
    private static TextProcess sharedTextProcess() {
        if (m_textProcess == null) {
            try {
                m_textProcess = new TextProcess();
            } catch (IOException e) {
                // Fail with the real cause instead of a later NullPointerException.
                throw new IllegalStateException("Unable to initialise TextProcess", e);
            }
        }
        return m_textProcess;
    }

    /**
     * Copies up to {@code len} processed characters into {@code cbuf},
     * starting at index {@code off} of {@code cbuf}.
     *
     * @param cbuf destination buffer
     * @param off  index in {@code cbuf} at which to start storing characters
     * @param len  maximum number of characters to copy
     * @return the number of characters copied, {@code 0} if {@code len} is zero,
     *         or {@code -1} when no processed characters remain
     * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not describe
     *         a valid range of {@code cbuf}
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (off < 0 || len < 0 || len > cbuf.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length);
        }
        if (len == 0) {
            return 0;
        }
        // After close() m_length is 0, so this also reports end-of-stream then.
        int remaining = m_length - m_offset;
        if (remaining <= 0) {
            return -1;
        }
        int count = Math.min(len, remaining);
        // 'off' is an offset into the destination only; the source position is
        // tracked solely by m_offset.
        System.arraycopy(m_inputData, m_offset, cbuf, off, count);
        m_offset += count;
        return count;
    }

    /** Releases the buffered text; subsequent reads report end-of-stream. */
    @Override
    public void close() throws IOException {
        m_inputData = null;
        m_offset = 0;
        m_length = 0;
    }

}

schema.xml 中的配置:

<!-- Text field type whose index-time and query-time analyzers both use the
     custom JvnTextProTokenizerFactory followed by a lower-case filter. -->
<fieldType name="text_jvnTextPro" class="solr.TextField" positionIncrementGap="100">
  <!-- Analysis chain applied when documents are indexed. -->
  <analyzer type="index">
        <tokenizer class="com.fosp.searchengine.JvnTextProTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
  <!-- Analysis chain applied to query strings; same tokenizer and filter as the index chain. -->
  <analyzer type="query">
        <tokenizer class="com.fosp.searchengine.JvnTextProTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>       
  </analyzer>
</fieldType>

1 个答案:

答案 0 :(得分:0)

这里并没有错。由工厂实例化的分词器对象会被重用,而分析/管理页面(analysis.jsp)每次都会重新创建,两者的行为不同,区别就在这里。