在Lucene.NET搜索中找不到某些单词

时间:2014-06-16 15:24:10

标签: html lucene.net

我将500多个html文档加载到了索引中,大多数搜索都能正常工作。然而,有些单词却搜索不到……例如 "italic" 或 "bold"。

/// <summary>
/// Adds the Inform 7 HTML documentation pages to the Lucene index stored in the
/// "LuceneIndex" folder next to the application's base directory.
/// </summary>
/// <param name="informPath">
/// Folder whose "Documentation" subfolder contains the doc*.html and rdoc*.html files.
/// </param>
/// <remarks>
/// Each page title is parsed as "chapter.section." followed by the rest of the title;
/// a page without a title element or without two dots in its title makes this method throw.
/// NOTE(review): an index built with the previous 140-term field limit has to be rebuilt
/// before the truncated words become searchable - presumably IsIndexedTopic skips pages
/// that are already indexed; confirm against its implementation.
/// </remarks>
public static void LoadTopics(string informPath)
{
    var path = new DirectoryInfo(Path.Combine(new FileInfo(AppDomain.CurrentDomain.BaseDirectory).Directory.FullName, "LuceneIndex"));
    if (!path.Exists)
    {
        path.Create();
        path.Refresh();
    }

    // Collect the documentation files before opening the writer, so that a missing
    // folder fails without touching the index.
    string documentationPath = informPath + @"\Documentation\";
    List<string> docs = new List<string>();
    docs.AddRange(System.IO.Directory.GetFiles(documentationPath, "doc*.html", SearchOption.TopDirectoryOnly));
    docs.AddRange(System.IO.Directory.GetFiles(documentationPath, "rdoc*.html", SearchOption.TopDirectoryOnly));

    // Empty stop-word set: no word is dropped by the analyzer.
    ISet<string> emptySet = new SortedSet<string>();

    // The using statements release the directory, analyzer and writer (and with it the
    // index write lock) even when reading or parsing one of the files throws.
    using (FSDirectory directory = new SimpleFSDirectory(path))
    using (Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30, emptySet))
    // MaxFieldLength is the maximum number of TERMS indexed per field. The former value
    // of 140 meant only the first 140 terms of every page were searchable; UNLIMITED
    // indexes the whole page.
    using (IndexWriter iwriter = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        foreach (string topic in docs)
        {
            string html = File.ReadAllText(topic);

            HtmlDocument hdoc = new HtmlDocument();
            hdoc.LoadHtml(html);

            string title = hdoc.DocumentNode.SelectSingleNode("//title").InnerText;

            // Title layout: "<chapter>.<section>.<rest>". The char overload of IndexOf
            // is an ordinal search, independent of the current culture.
            int p1 = title.IndexOf('.');
            int p2 = title.IndexOf('.', p1 + 1);
            string chapter = title.Substring(0, p1);
            string section = title.Substring(p1 + 1, p2 - p1 - 1);

            if (!IsIndexedTopic(chapter, section))
            {
                Document doc = new Document();
                doc.Add(new Field("chapter", chapter, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("section", section, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
                // The raw HTML (markup and scripts included) is what gets analyzed and stored.
                doc.Add(new Field("contents", html, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("filepath", topic, Field.Store.YES, Field.Index.ANALYZED));
                iwriter.AddDocument(doc);
            }
        }

        iwriter.Commit();
    }
}

    /// <summary>
    /// Searches the "contents" field of the Lucene index in the "LuceneIndex" folder
    /// and returns the matching topics.
    /// </summary>
    /// <param name="searchTerm">Query text, parsed with the Lucene query parser syntax.</param>
    /// <returns>
    /// At most 1000 topics, in the order returned by the searcher, each carrying its
    /// chapter, section, title, file path and score.
    /// </returns>
    public static List<Inform7Topic> SearchTopics(string searchTerm)
    {
        var path = new DirectoryInfo(Path.Combine(new FileInfo(AppDomain.CurrentDomain.BaseDirectory).Directory.FullName, "LuceneIndex"));
        if (!path.Exists)
        {
            path.Create();
            path.Refresh();
        }

        // Empty stop-word set, matching the analyzer configuration used when indexing.
        ISet<string> emptySet = new SortedSet<string>();

        List<Inform7Topic> viewModel = new List<Inform7Topic>();

        // The using statements release every resource even when the query cannot be
        // parsed or the search fails; previously such a failure leaked the reader,
        // the analyzer and the directory, and the searcher was never disposed at all.
        using (FSDirectory directory = new SimpleFSDirectory(path))
        using (Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30, emptySet))
        using (IndexReader ireader = DirectoryReader.Open(directory, true))
        using (IndexSearcher isearcher = new IndexSearcher(ireader))
        {
            QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "contents", analyzer);
            Query query = parser.Parse(searchTerm);
            ScoreDoc[] hits = isearcher.Search(query, null, 1000).ScoreDocs;

            // Turn every hit into a topic using the stored field values.
            foreach (ScoreDoc hit in hits)
            {
                Document hitDoc = isearcher.Doc(hit.Doc);

                Inform7Topic topic = new Inform7Topic();
                topic.Chapter = hitDoc.GetField("chapter").StringValue;
                topic.Section = hitDoc.GetField("section").StringValue;
                topic.Title = hitDoc.GetField("title").StringValue;
                topic.FilePath = hitDoc.GetField("filepath").StringValue;
                topic.Score = hit.Score;

                viewModel.Add(topic);
            }
        }

        return viewModel;
    }

正如你所看到的,我甚至尝试过使用空的停用词列表。我搜索不到 "italic" 这个词,而它出现在44个html文件中。诚然,在某些情况下,这个单词位于javascript字符串里,但Lucene应该会把整个文档当作一串文本来处理,对吧?

我错过了什么?

0 个答案:

没有答案