如何限制 Lucene.Net 只索引长度大于 x 的词项(term)? 我目前按如下方式为文档建立索引:
// Locations used by the snippet: where the index lives and where the text to index is read from.
string indexDirectory = @"C:\Users\user\Desktop\Index";
string dataDirectory = @"C:\Users\user\Desktop\Data";

// Writer over the index location, analyzing field text with StandardAnalyzer.
var analyzer = new StandardAnalyzer();
var writer = new IndexWriter(indexDirectory, analyzer);

// One document with two tokenized, non-stored fields:
//   "path"    - the data location, without term vectors
//   "content" - the text returned by ReadTextFile, with term vectors enabled
var doc = new Document();
doc.Add(new Lucene.Net.Documents.Field(
    "path",
    dataDirectory,
    Field.Store.NO,
    Field.Index.TOKENIZED,
    Field.TermVector.NO));
doc.Add(new Field(
    "content",
    ReadTextFile(dataDirectory),
    Field.Store.NO,
    Field.Index.TOKENIZED,
    Field.TermVector.YES));
我使用以下代码从 Lucene 索引中读取已索引的词项:
// Read the term vectors of document 0 and walk every indexed term.
// The reader is held in a local so it can be closed once the loop is done,
// even if reading the vectors throws.
IndexReader reader = IndexReader.Open(indexDirectory);
try
{
    TermFreqVector[] vectors = reader.GetTermFreqVectors(0);

    // GetTermFreqVectors may return null when the document has no stored term vectors.
    if (vectors != null)
    {
        foreach (Lucene.Net.Index.TermFreqVector vector in vectors)
        {
            String[] terms = vector.GetTerms();
            foreach (String term in terms)
            {
                // loop through indexed terms
            }
        }
    }
}
finally
{
    reader.Close();
}
答案 0 :(得分:2)
您可以实现自己的分析器,或者扩展 StandardAnalyzer。
示例:
TokenFilter + Analyzer
/// <summary>
/// Token filter that passes through only those tokens of the wrapped stream
/// whose term length is at least a configured minimum; shorter tokens are skipped.
/// </summary>
public class MinTermLengthTokenFilter : TokenFilter
{
    // Smallest term length that is still let through.
    private int shortestAllowedLength;

    // Term attribute registered on this stream; used to read the current token's length.
    private TermAttribute currentTerm;

    /// <summary>
    /// Creates the filter on top of <paramref name="input"/>.
    /// </summary>
    /// <param name="maxTermLength">
    /// Despite its name, this value is used as the MINIMUM term length:
    /// tokens with fewer characters are dropped.
    /// </param>
    /// <param name="input">The token stream to filter.</param>
    public MinTermLengthTokenFilter(int maxTermLength, TokenStream input)
        : base(input)
    {
        currentTerm = (TermAttribute)AddAttribute(typeof(TermAttribute));
        shortestAllowedLength = maxTermLength;
    }

    /// <summary>
    /// Advances the wrapped stream until a token of sufficient length is found.
    /// </summary>
    /// <returns>
    /// <c>true</c> when positioned on a token whose length is at least the minimum;
    /// <c>false</c> once the wrapped stream is exhausted.
    /// </returns>
    public override bool IncrementToken()
    {
        for (;;)
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            if (currentTerm.TermLength() < shortestAllowedLength)
            {
                // Too short: skip it and look at the next token.
                continue;
            }

            return true;
        }
    }
}
/// <summary>
/// A <c>StandardAnalyzer</c> whose token streams are additionally wrapped in a
/// <c>MinTermLengthTokenFilter</c>, so only terms of at least the configured
/// length come out of the analyzer.
/// </summary>
public class MinTermLengthAnalyzer : StandardAnalyzer
{
    // Minimum term length handed to every filter this analyzer creates.
    private int requiredLength;

    /// <summary>
    /// Creates the analyzer.
    /// </summary>
    /// <param name="minTermLength">Minimum length a term must have to be kept.</param>
    public MinTermLengthAnalyzer(int minTermLength)
    {
        requiredLength = minTermLength;
    }

    /// <summary>
    /// Returns the base analyzer's token stream wrapped in the length filter.
    /// </summary>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var standardTokens = base.TokenStream(fieldName, reader);
        return new MinTermLengthTokenFilter(requiredLength, standardTokens);
    }

    /// <summary>
    /// Returns the base analyzer's reusable token stream wrapped in a new length filter.
    /// </summary>
    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        var reusableTokens = base.ReusableTokenStream(fieldName, reader);
        return new MinTermLengthTokenFilter(requiredLength, reusableTokens);
    }
}
索引:
// Build the document first: a single stored, analyzed "text" field with
// term vectors that include positions and offsets.
Document document = new Document();
document.Add(new Field(
    "text",
    "some sample text for demonstration",
    Field.Store.YES,
    Field.Index.ANALYZED,
    Field.TermVector.WITH_POSITIONS_OFFSETS));

// Index it with an analyzer configured with a minimum term length of 5.
FSDirectory dir = FSDirectory.GetDirectory("C:\\temp\\CFSTEST");
IndexWriter writer = new IndexWriter(dir, new MinTermLengthAnalyzer(5));
try
{
    writer.AddDocument(document);
}
finally
{
    // Close the writer even if AddDocument throws, so it is never left open.
    writer.Close();
}
搜索:
// Search the "text" field for "demonstration" and print every term stored in
// the term vectors of each hit.
// The reader is kept in its own local: the searcher is constructed from an
// existing reader, so the reader is closed explicitly here as well.
IndexReader indexReader = IndexReader.Open("C:\\temp\\CFSTEST");
try
{
    var indexSearcher = new IndexSearcher(indexReader);
    try
    {
        var results = indexSearcher.Search(new TermQuery(new Term("text", "demonstration")), null, 25);
        foreach (var result in results.ScoreDocs)
        {
            TermFreqVector[] vectors = indexReader.GetTermFreqVectors(result.doc);

            // A document without stored term vectors yields null; nothing to print for it.
            if (vectors == null)
            {
                continue;
            }

            foreach (Lucene.Net.Index.TermFreqVector vector in vectors)
            {
                String[] terms = vector.GetTerms();
                foreach (String term in terms)
                {
                    Console.WriteLine(term);
                }
            }
        }
    }
    finally
    {
        indexSearcher.Close();
    }
}
finally
{
    indexReader.Close();
}
// outputs:
// demonstration
// sample