I'm using Lucene.Net 3.0.3 and I have a simple custom analyzer and tokenizer that split terms on TAB characters. I measured it, and indexing turned out to be about twice as slow as with StandardAnalyzer (which does much more work). Do you know what the problem might be, or whether there is a better solution?
The code is below:
public class CustomAnalyzer : Analyzer
{
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new CustomTokenizer(reader);
        //return new LetterTokenizer(reader);
    }

    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        Tokenizer tokenizer = this.PreviousTokenStream as Tokenizer;
        if (tokenizer == null)
        {
            tokenizer = new CustomTokenizer(reader);
            //tokenizer = new LetterTokenizer(reader);
        }
        else
        {
            tokenizer.Reset(reader);
        }
        return tokenizer;
    }
}
public class CustomTokenizer : LetterTokenizer
{
    public CustomTokenizer(TextReader reader)
        : base(reader)
    { }

    protected override char Normalize(char c)
    {
        return char.ToLower(c, CultureInfo.InvariantCulture);
    }

    protected override bool IsTokenChar(char c)
    {
        // Everything except TAB (U+0009) counts as a token character.
        return c != '\x0009';
    }
}
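
Roughly, the timing comparison looks like the sketch below. The helper name MeasureIndexing, the "content" field, and the in-memory RAMDirectory are placeholders for illustration rather than my exact setup; the same documents are indexed once per analyzer and the elapsed time is compared.

    using System;
    using System.Diagnostics;
    using Lucene.Net.Analysis;
    using Lucene.Net.Documents;
    using Lucene.Net.Index;
    using Lucene.Net.Store;

    public static class AnalyzerBenchmark
    {
        // Indexes the given lines with the supplied analyzer and returns the elapsed time.
        public static TimeSpan MeasureIndexing(Analyzer analyzer, string[] lines)
        {
            var stopwatch = Stopwatch.StartNew();
            using (var directory = new RAMDirectory())
            using (var writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                foreach (var line in lines)
                {
                    var doc = new Document();
                    doc.Add(new Field("content", line, Field.Store.NO, Field.Index.ANALYZED));
                    writer.AddDocument(doc);
                }
                writer.Commit();
            }
            stopwatch.Stop();
            return stopwatch.Elapsed;
        }
    }

    // Usage:
    // var customTime   = AnalyzerBenchmark.MeasureIndexing(new CustomAnalyzer(), lines);
    // var standardTime = AnalyzerBenchmark.MeasureIndexing(
    //     new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), lines);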
Answer 0 (score: 0)
I forgot to update this thread: the problem was in the custom analyzer. I wasn't actually saving the tokenizer to PreviousTokenStream, so a new tokenizer was being created every time.
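
Based on that explanation, a sketch of the corrected ReusableTokenStream just adds the missing assignment; the rest is unchanged from the code in the question:

    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        Tokenizer tokenizer = this.PreviousTokenStream as Tokenizer;
        if (tokenizer == null)
        {
            tokenizer = new CustomTokenizer(reader);
            // The missing line: remember the tokenizer so the next call can reuse it.
            this.PreviousTokenStream = tokenizer;
        }
        else
        {
            // Reuse the cached tokenizer by pointing it at the new reader.
            tokenizer.Reset(reader);
        }
        return tokenizer;
    }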