使用标准分析器(StandardAnalyzer)时,Lucene 查询不返回命中结果

时间:2014-07-18 18:48:26

标签: c# asp.net lucene

我有一个名为 thatfeelwhen.pdf 的文件。当我用 "that" 或 "feel" 这样的词进行搜索时,得不到任何命中结果;而输入 "when" 或完整的文件名时则可以命中。我使用的是标准分析器(StandardAnalyzer)。怎样才能让 Lucene 的搜索器匹配所有内容?我的搜索查询似乎能够匹配文件中的内容,但无法匹配文件名。

public partial class _Default : Page
{
    Directory finalDirectory = null;
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

其他方法中的代码:

/// <summary>
/// Adds one document to the index with two analyzed fields: "fileName"
/// (stored, so it can be read back from a search hit) and "pdfBody"
/// (indexed only, not stored).
/// </summary>
/// <param name="filename">Value indexed and stored in the "fileName" field.</param>
/// <param name="pdfBody">Text indexed in the "pdfBody" field.</param>
/// <param name="writer">Open index writer that receives the new document.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="filename"/>, <paramref name="pdfBody"/> or
/// <paramref name="writer"/> is null.
/// </exception>
private static void AddTextToIndex(string filename, string pdfBody, IndexWriter writer)
    {
        // Fail with a named argument instead of an anonymous NullReferenceException.
        if (filename == null)
        {
            throw new ArgumentNullException("filename");
        }
        if (pdfBody == null)
        {
            throw new ArgumentNullException("pdfBody");
        }
        if (writer == null)
        {
            throw new ArgumentNullException("writer");
        }

        Document doc = new Document();
        // Both arguments are already strings, so no ToString() conversion is needed.
        doc.Add(new Field("fileName", filename, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("pdfBody", pdfBody, Field.Store.NO, Field.Index.ANALYZED));
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Builds the Lucene index on disk from every file found in the forms
    /// folder: each file's text is extracted with <c>pdfText</c> and added
    /// together with its file name via <c>AddTextToIndex</c>.
    /// </summary>
    /// <param name="analyzer">Analyzer passed to the <c>IndexWriter</c>.</param>
    /// <returns>
    /// The directory opened on <c>C:\mywebsite\files\LuceneIndex</c> that the
    /// index was written to.
    /// </returns>
    private static Directory buildIndex(Analyzer analyzer)
    {
        string[] syllabusFiles = System.IO.Directory.GetFiles(@"C:\mywebsite\files\forms");
        Directory directory = FSDirectory.Open(new DirectoryInfo(@"C:\mywebsite\files\LuceneIndex"));

        // The using block disposes the writer even if text extraction or
        // indexing throws, so the writer is never left open.
        using (var writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
        {
            foreach (string filePath in syllabusFiles)
            {
                string pdfTextExtracted = pdfText(filePath);

                // Take the file name from the path itself rather than stripping a
                // hard-coded prefix: the prefix "C:\website\files\forms" did not
                // match the scanned folder ("C:\mywebsite\..."), so the whole
                // path ended up in the "fileName" field.
                string fileNameOnly = System.IO.Path.GetFileName(filePath);

                AddTextToIndex(fileNameOnly, pdfTextExtracted, writer);
            }

            writer.Optimize();
        }

        return directory;
    }

    protected void txtBoxSearchPDF_Click(object sender, EventArgs e)
    {
        if (txtBoxSearchString.Text == "")
        {
            lblNoSearchString.Visible = true;               
        }
        else if (txtBoxSearchString.Text == "build_index")
        {
            this.finalDirectory = buildIndex(this.analyzer);
        }
        else
        {
            //searching PDF text
            lblNoSearchString.Visible = false;
            StringBuilder sb = new StringBuilder();
            this.finalDirectory = FSDirectory.Open(new DirectoryInfo(@"C:\mywebsite\files\LuceneIndex"));
            IndexReader indexReader = IndexReader.Open(this.finalDirectory, true);
            Searcher indexSearch = new IndexSearcher(indexReader);
            string searchQuery = txtBoxSearchString.Text;
            var fields = new[] { "fileName", "pdfBody" };
            var queryParser = new MultiFieldQueryParser(Version.LUCENE_30, fields, this.analyzer);
            Query query;
            try
            {
                query = queryParser.Parse(searchQuery.Trim());
            }
            catch (ParseException)
            {
                query = queryParser.Parse(QueryParser.Escape(searchQuery.Trim()));
            }
            TopDocs resultDocs = indexSearch.Search(query, indexReader.MaxDoc);                

            var hits = resultDocs.ScoreDocs;
            foreach (var hit in hits)
            {
                var documentFromSearcher = indexSearch.Doc(hit.Doc);
                string getResult = documentFromSearcher.Get("fileName");
                string formattedResult = getResult.Replace(" ", "%20");
                sb.AppendLine(@"<a href=https://website.com/search/forms/" + formattedResult+ ">" + getResult+"</a>");
                sb.AppendLine("<br>");
            }

1 个答案:

答案 0 :(得分:1)

我选择使用Analyzer analyzer = new SingleCharTokenAnalyzer();并获得更好的结果。

我尝试过 Simple、Standard、Whitespace 和 Keyword 这几种分析器,但如果不额外花功夫去定制,它们都无法真正满足我的需求。