我将500多个html文档加载到索引中,大多数搜索都能正常运行。然而,他们中的一些人不会......例如" italic"或"粗体"。
public static void LoadTopics(string informPath)
{
var path = new DirectoryInfo(Path.Combine(new FileInfo(AppDomain.CurrentDomain.BaseDirectory).Directory.FullName, "LuceneIndex"));
if (!path.Exists)
{
path.Create();
path.Refresh();
}
ISet<string> emptySet = new SortedSet<string>();
Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30, emptySet);
FSDirectory directory = new SimpleFSDirectory(path);
IndexWriter iwriter = new IndexWriter(directory, analyzer, new IndexWriter.MaxFieldLength(140));
// get list of I7 html documentation files
List<Inform7Topic> topics = new List<Inform7Topic>();
string[] docFiles = System.IO.Directory.GetFiles(informPath + @"\Documentation\", "doc*.html", SearchOption.TopDirectoryOnly);
string[] rdocFiles = System.IO.Directory.GetFiles(informPath + @"\Documentation\", "rdoc*.html", SearchOption.TopDirectoryOnly);
List<string> docs = new List<string>();
docs.AddRange(docFiles);
docs.AddRange(rdocFiles);
foreach (string topic in docs)
{
string html = File.ReadAllText(topic);
HtmlDocument hdoc = new HtmlDocument();
hdoc.LoadHtml(html);
string title = hdoc.DocumentNode.SelectSingleNode("//title").InnerText;
int p1 = title.IndexOf(".");
int p2 = title.IndexOf(".", p1 + 1);
string chapter = title.Substring(0, p1);
string section = title.Substring(p1 + 1, p2 - p1 - 1);
if (!IsIndexedTopic(chapter, section))
{
Document doc = new Document();
doc.Add(new Field("chapter", chapter, Field.Store.YES, Field.Index.ANALYZED));
doc.Add(new Field("section", section, Field.Store.YES, Field.Index.ANALYZED));
doc.Add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
doc.Add(new Field("contents", html, Field.Store.YES, Field.Index.ANALYZED));
doc.Add(new Field("filepath", topic, Field.Store.YES, Field.Index.ANALYZED));
iwriter.AddDocument(doc);
}
}
iwriter.Commit();
iwriter.Dispose();
}
public static List<Inform7Topic> SearchTopics(string searchTerm)
{
var path = new DirectoryInfo(Path.Combine(new FileInfo(AppDomain.CurrentDomain.BaseDirectory).Directory.FullName, "LuceneIndex"));
if (!path.Exists)
{
path.Create();
path.Refresh();
}
ISet<string> emptySet = new SortedSet<string>();
FSDirectory directory = new SimpleFSDirectory(path);
Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30, emptySet);
// Now search the index:
IndexReader ireader = DirectoryReader.Open(directory, true);
IndexSearcher isearcher = new IndexSearcher(ireader);
QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "contents", analyzer);
Query query = parser.Parse(searchTerm);
ScoreDoc[] hits = isearcher.Search(query, null, 1000).ScoreDocs;
// Iterate through the results:
List<Inform7Topic> viewModel = new List<Inform7Topic>();
for (int i = 0; i < hits.Length; i++)
{
Document hitDoc = isearcher.Doc(hits[i].Doc);
Inform7Topic topic = new Inform7Topic();
string chapter = hitDoc.GetField("chapter").StringValue;
string section = hitDoc.GetField("section").StringValue;
string title = hitDoc.GetField("title").StringValue;
string filePath = hitDoc.GetField("filepath").StringValue;
string contents = hitDoc.GetField("contents").StringValue;
topic.Chapter = chapter;
topic.Section = section;
topic.Title = title;
topic.FilePath = filePath;
topic.Score = hits[i].Score;
viewModel.Add(topic);
}
ireader.Dispose();
analyzer.Dispose();
directory.Dispose();
return viewModel;
}
正如你所看到的,我甚至尝试过一个空的停止单词列表。我无法点击&#34; italic&#34;这个词是在44个html文件中。现在在某些情况下,单词是在javascript字符串中,但Lucene应该将整个文档视为一串文本,对吧?
我错过了什么?