我正在运行.NET 4.5和Lucene.Net 3.0.3,并试图“修复”ASCIIFoldingFilter的变音符号(umlaut)行为(因为它把ä转换为a而不是ae——例如,搜索Geschäft和Geschaeft应该得到相同的结果)。
我已经实现了自己的分析器,并且正尝试在其中添加字符映射;分析器目前的实现如下:
/// <summary>
/// Analyzer derived from <c>KeywordAnalyzer</c> that post-processes the base
/// token stream with lower-casing followed by ASCII folding.
/// </summary>
public sealed class LowerCaseKeywordAnalyzer : Lucene.Net.Analysis.KeywordAnalyzer
{
    /// <summary>
    /// Builds the analysis chain for a field: the base analyzer's token stream,
    /// wrapped in a <c>LowerCaseFilter</c>, wrapped in an <c>ASCIIFoldingFilter</c>.
    /// </summary>
    /// <param name="fieldName">Field name; forwarded unchanged to the base analyzer.</param>
    /// <param name="reader">Text source; forwarded unchanged to the base analyzer.</param>
    /// <returns>The outermost filter of the chain (the ASCII folding filter).</returns>
    public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, TextReader reader)
    {
        // Each step wraps the previous stage, so the order below is the processing order.
        Lucene.Net.Analysis.TokenStream chain = base.TokenStream(fieldName, reader);
        chain = new Lucene.Net.Analysis.LowerCaseFilter(chain);
        chain = new Lucene.Net.Analysis.ASCIIFoldingFilter(chain);
        return chain;
    }
}
现在我尝试添加Lucene.Net.Analysis.MappingCharFilter,但不知道该如何注入它所需的输入,例如:
- 或public sealed class LowerCaseKeywordAnalyzer : Lucene.Net.Analysis.KeywordAnalyzer
{
// Attempted variant of the analyzer that tries to add a MappingCharFilter.
// NOTE: this snippet is intentionally incomplete and does not compile as written.
public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, TextReader reader)
{
var keywordTokenizer = base.TokenStream(fieldName, reader);
var lowerCaseFilter = new Lucene.Net.Analysis.LowerCaseFilter(keywordTokenizer);
// Open question: both constructor arguments are placeholders (the map source and `???`).
// The only values in scope here are `reader` and the token streams created above.
var mappingCharFilter = new Lucene.Net.Analysis.MappingCharFilter(/* get map from somewhere*/, ???);
// NOTE(review): mappingCharFilter is not passed to anything below; the returned chain is
// still keyword stream -> LowerCaseFilter -> ASCIIFoldingFilter.
var asciiFoldingFilter = new Lucene.Net.Analysis.ASCIIFoldingFilter(lowerCaseFilter);
return asciiFoldingFilter;
}
}
问题是:如何注入CharStream或TextReader实例?我只有Lucene.Net.Analysis.TokenStream实例(例如Lucene.Net.Analysis.LowerCaseFilter)……
除了编写一个完成这项工作的自定义Lucene.Net.Analysis.TokenFilter之外,有没有办法让它正常工作?
答案 0 :(得分:2)
我已经实现了自己的Lucene.Net.Analysis.TokenFilter:
/// <summary>
/// Token filter that rewrites the German umlauts in each term as two-letter
/// sequences (Ä→AE, Ö→OE, Ü→UE, ä→ae, ö→oe, ü→ue). Every other character,
/// including ß, is copied through unchanged.
/// </summary>
public sealed class UmlautsFoldingFilter : Lucene.Net.Analysis.TokenFilter
{
    // Umlauts handled by this filter; each index lines up with its base letter in BaseLetters.
    private const string Umlauts = "ÄÖÜäöü";
    private const string BaseLetters = "AOUaou";

    // Entries below this index are upper-case and are followed by 'E'; the rest by 'e'.
    private const int FirstLowerCaseIndex = 3;

    private readonly Lucene.Net.Analysis.Tokenattributes.ITermAttribute _termAttribute;

    // Scratch buffer holding the folded term; grown on demand in FoldUmlaut.
    private char[] _output = new char[512];

    // Number of valid characters currently in _output.
    private int _outputPosition;

    /// <summary>Creates the filter on top of <paramref name="input"/>.</summary>
    /// <param name="input">Upstream token stream whose terms are folded.</param>
    public UmlautsFoldingFilter(Lucene.Net.Analysis.TokenStream input)
        : base(input)
    {
        _termAttribute = AddAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
    }

    /// <summary>
    /// Advances the upstream stream by one token and replaces the term text
    /// with its umlaut-folded form.
    /// </summary>
    /// <returns><c>false</c> when the upstream stream is exhausted; otherwise <c>true</c>.</returns>
    public override bool IncrementToken()
    {
        if (input.IncrementToken())
        {
            FoldUmlaut(_termAttribute.TermBuffer(), _termAttribute.TermLength());
            _termAttribute.SetTermBuffer(_output, 0, _outputPosition);
            return true;
        }

        return false;
    }

    /// <summary>
    /// Copies the first <paramref name="termLength"/> characters of
    /// <paramref name="termBuffer"/> into the scratch buffer, expanding umlauts,
    /// and leaves the resulting length in <c>_outputPosition</c>.
    /// </summary>
    private void FoldUmlaut(char[] termBuffer, int termLength)
    {
        // Reserve four output chars per input char (the mappings here need at most two).
        var requiredCapacity = 4 * termLength;
        if (requiredCapacity > _output.Length)
        {
            _output = new char[Lucene.Net.Util.ArrayUtil.GetNextSize(requiredCapacity)];
        }

        _outputPosition = 0;
        for (var i = 0; i < termLength; i++)
        {
            var current = termBuffer[i];
            var slot = Umlauts.IndexOf(current);
            if (slot < 0)
            {
                // Not an umlaut: copy as-is.
                _output[_outputPosition++] = current;
            }
            else
            {
                _output[_outputPosition++] = BaseLetters[slot];
                _output[_outputPosition++] = slot < FirstLowerCaseIndex ? 'E' : 'e';
            }
        }
    }
}
答案 1 :(得分:0)
这是一个旧问题的答案,但可能仍然相关,因为原始答案没有显示在Lucene.net中使用NormalizeCharMap。 在您的Analyzer类中,还覆盖InitReader,如下所示:
// Built once and shared: the umlaut map never changes, so it does not need to be
// rebuilt every time a reader is initialised.
private static readonly NormalizeCharMap UmlautCharMap = BuildUmlautCharMap();

/// <summary>
/// Builds the character map that expands German umlauts to two-letter sequences
/// (Ä→AE, ä→ae, Ö→OE, ö→oe, Ü→UE, ü→ue).
/// </summary>
private static NormalizeCharMap BuildUmlautCharMap()
{
    var builder = new NormalizeCharMap.Builder();
    builder.Add("Ä", "AE");
    builder.Add("ä", "ae");
    builder.Add("Ö", "OE");
    builder.Add("ö", "oe");
    builder.Add("Ü", "UE");
    builder.Add("ü", "ue");
    return builder.Build();
}

/// <summary>
/// Wraps the field's reader in a <c>MappingCharFilter</c> so that the umlaut
/// replacements are applied to the characters the analyzer reads.
/// </summary>
/// <remarks>
/// Declared <c>protected</c> because <c>Analyzer.InitReader</c> is
/// <c>protected internal virtual</c> in Lucene.NET; an override in another assembly
/// cannot widen it to <c>public</c> (compiler error CS0507).
/// </remarks>
/// <param name="fieldName">Name of the field being analyzed (not used here).</param>
/// <param name="reader">Original character source for the field.</param>
/// <returns>A reader that yields the mapped character stream.</returns>
protected override TextReader InitReader(string fieldName, TextReader reader)
{
    return new MappingCharFilter(UmlautCharMap, reader);
}
这应该为您提供所需的替换/规范化。