如何将MappingCharFilter与Lucene.Net一起使用

时间:2014-02-24 08:41:53

标签: lucene.net

我正在运行.NET 4.5和Lucene.Net 3.0.3,并试图“修复”ASCIIFoldingFilter的变音符号行为(因为它会把ä转换为a而不是ae——例如使用Geschäft和Geschaeft进行搜索,结果应该是相同的)。

我已经实现了自己的分析器:

Geschäft

现在我尝试添加Lucene.Net.Analysis.MappingCharFilter - 例如:

/// <summary>
/// A keyword analyzer whose output is additionally passed through a
/// <c>LowerCaseFilter</c> and then an <c>ASCIIFoldingFilter</c>.
/// </summary>
public sealed class LowerCaseKeywordAnalyzer : Lucene.Net.Analysis.KeywordAnalyzer
{
    /// <summary>
    /// Builds the token stream for a field by wrapping the base analyzer's
    /// stream in the lower-casing and ASCII-folding filters, in that order.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; forwarded to the base analyzer.</param>
    /// <param name="reader">Source of the text to analyze; forwarded to the base analyzer.</param>
    /// <returns>The outermost filter of the chain (the ASCII-folding filter).</returns>
    public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, TextReader reader)
    {
        // Each step wraps the previous stream, so the last assignment is the head of the chain.
        Lucene.Net.Analysis.TokenStream stream = base.TokenStream(fieldName, reader);
        stream = new Lucene.Net.Analysis.LowerCaseFilter(stream);
        stream = new Lucene.Net.Analysis.ASCIIFoldingFilter(stream);
        return stream;
    }
}

添加MappingCharFilter后的尝试如下:

public sealed class LowerCaseKeywordAnalyzer : Lucene.Net.Analysis.KeywordAnalyzer
{
    public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var keywordTokenizer = base.TokenStream(fieldName, reader);
        var lowerCaseFilter = new Lucene.Net.Analysis.LowerCaseFilter(keywordTokenizer);
        var mappingCharFilter = new Lucene.Net.Analysis.MappingCharFilter(/* get map from somewhere*/, ???);
        var asciiFoldingFilter = new Lucene.Net.Analysis.ASCIIFoldingFilter(lowerCaseFilter);

        return asciiFoldingFilter;
    }
}

但是如何注入Lucene.Net.Analysis.CharStream或TextReader实例?我只有Lucene.Net.Analysis.TokenStream实例(Lucene.Net.Analysis.LowerCaseFilter)……
除了编写一个完成这项工作的自定义Lucene.Net.Analysis.TokenFilter之外,还有没有办法让它正常工作?

2 个答案:

答案 0 :(得分:2)

我已经实现了自己的Lucene.Net.Analysis.TokenFilter

/// <summary>
/// Token filter that rewrites German umlauts in each term to their two-letter
/// transcription: Ä→AE, Ö→OE, Ü→UE, ä→ae, ö→oe, ü→ue. All other characters
/// are passed through unchanged.
/// </summary>
public sealed class UmlautsFoldingFilter : Lucene.Net.Analysis.TokenFilter
{
    /// <summary>Largest number of output characters a single input character can produce.</summary>
    private const int MaxExpansionPerChar = 2;

    private readonly Lucene.Net.Analysis.Tokenattributes.ITermAttribute _termAttribute;

    // Scratch buffer the folded term is assembled in; grown on demand and reused across tokens.
    private char[] _output = new char[512];

    // Number of valid characters currently in _output.
    private int _outputPosition;

    /// <summary>Creates the filter on top of <paramref name="input"/>.</summary>
    /// <param name="input">The token stream whose terms are to be folded.</param>
    public UmlautsFoldingFilter(Lucene.Net.Analysis.TokenStream input)
        : base(input)
    {
        this._termAttribute = this.AddAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and folds the umlauts in its term.
    /// </summary>
    /// <returns><c>false</c> when the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
    public override bool IncrementToken()
    {
        if (!this.input.IncrementToken())
        {
            return false;
        }

        var termBuffer = this._termAttribute.TermBuffer();
        var termLength = this._termAttribute.TermLength();

        // Fast path: a term without umlauts would be copied back unchanged,
        // so leave the term attribute untouched.
        if (!ContainsUmlaut(termBuffer, termLength))
        {
            return true;
        }

        this.FoldUmlaut(termBuffer, termLength);

        this._termAttribute.SetTermBuffer(this._output, 0, this._outputPosition);

        return true;
    }

    /// <summary>Tells whether <paramref name="ch"/> is one of the six characters this filter rewrites.</summary>
    private static bool IsUmlaut(char ch)
    {
        switch (ch)
        {
            case 'Ä':
            case 'Ö':
            case 'Ü':
            case 'ä':
            case 'ö':
            case 'ü':
                return true;
            default:
                return false;
        }
    }

    /// <summary>Tells whether the first <paramref name="termLength"/> characters contain an umlaut.</summary>
    private static bool ContainsUmlaut(char[] termBuffer, int termLength)
    {
        for (var index = 0; index < termLength; ++index)
        {
            if (IsUmlaut(termBuffer[index]))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>Appends a two-character replacement to the scratch buffer.</summary>
    private void Append(char first, char second)
    {
        this._output[this._outputPosition++] = first;
        this._output[this._outputPosition++] = second;
    }

    /// <summary>
    /// Writes the folded form of the first <paramref name="termLength"/> characters of
    /// <paramref name="termBuffer"/> into <c>_output</c> and records the resulting
    /// length in <c>_outputPosition</c>.
    /// </summary>
    private void FoldUmlaut(char[] termBuffer,
                            int termLength)
    {
        // Worst case: every character is an umlaut and expands to two characters.
        var targetSize = MaxExpansionPerChar * termLength;
        if (this._output.Length < targetSize)
        {
            this._output = new char[Lucene.Net.Util.ArrayUtil.GetNextSize(targetSize)];
        }

        this._outputPosition = 0;
        for (var index = 0; index < termLength; ++index)
        {
            var ch = termBuffer[index];
            switch (ch)
            {
                case 'Ä':
                    this.Append('A', 'E');
                    break;
                case 'Ö':
                    this.Append('O', 'E');
                    break;
                case 'Ü':
                    this.Append('U', 'E');
                    break;
                case 'ä':
                    this.Append('a', 'e');
                    break;
                case 'ö':
                    this.Append('o', 'e');
                    break;
                case 'ü':
                    this.Append('u', 'e');
                    break;
                default:
                    this._output[this._outputPosition++] = ch;
                    break;
            }
        }
    }
}

答案 1 :(得分:0)

这是一个旧问题的答案,但可能仍然相关,因为原始答案没有显示在Lucene.net中使用NormalizeCharMap。 在您的Analyzer类中,还覆盖InitReader,如下所示:

// Umlaut replacement table, built once and shared by every reader this analyzer
// creates; it is only read after Build() returns.
private static readonly NormalizeCharMap UmlautCharMap = BuildUmlautCharMap();

/// <summary>Builds the character map that transcribes German umlauts (Ä→AE, ä→ae, ...).</summary>
private static NormalizeCharMap BuildUmlautCharMap()
{
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("Ä", "AE");
    builder.Add("ä", "ae");
    builder.Add("Ö", "OE");
    builder.Add("ö", "oe");
    builder.Add("Ü", "UE");
    builder.Add("ü", "ue");
    return builder.Build();
}

/// <summary>
/// Wraps the incoming reader in a <see cref="MappingCharFilter"/> so that umlauts
/// are replaced by their two-letter transcription before the text is tokenized.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used here).</param>
/// <param name="reader">The raw character source for the field.</param>
/// <returns>A reader that yields the text of <paramref name="reader"/> with the mappings applied.</returns>
/// <remarks>
/// Declared <c>protected</c> because the base <c>Analyzer.InitReader</c> is
/// <c>protected internal</c>; an override in another assembly cannot widen it to
/// <c>public</c> (compiler error CS0507).
/// </remarks>
protected override TextReader InitReader(string fieldName, TextReader reader)
{
    return new MappingCharFilter(UmlautCharMap, reader);
}

这应该为您提供所需的替换/规范化。