我有一个用solr创建的lucene索引。 lucene版本是3.6.1。
我在网上找到了一个读取lucene索引的java程序:
http://www.javacodegeeks.com/2010/05/introduction-to-apache-lucene-for-full.html
我针对本地环境修改了该程序,但对于一个在索引中明明有结果的查询,它总是提示没有找到任何命中。程序一直不成功,于是我把代码改为使用StandardAnalyzer而不是SimpleAnalyzer,仍然没有效果。
以下是代码:
package com.javacodegeeks.lucene;
import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Minimal command-line searcher that runs a single query against the
 * {@code title} field of an on-disk Lucene index and prints the hits.
 *
 * <p>NOTE(review): the query is analyzed with {@link StandardAnalyzer}, which
 * does not stem. The schema shown for this index analyzes the {@code text}
 * field type with a Snowball (English) stemmer at index time, so an unstemmed
 * query term such as "science" may not match the stemmed term stored in the
 * index. The query-time analysis presumably has to mirror the index-time
 * chain for this search to return hits; confirm with a tool such as Luke.
 */
public class StandardSearcher {

    /**
     * Entry point: searches the hard-coded index directory for the
     * hard-coded query and prints at most 100 hits.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        File indexDir = new File("/path/to/solr/data/index/");
        String query = "science";
        int hits = 100;
        StandardSearcher searcher = new StandardSearcher();
        searcher.searchIndex(indexDir, query, hits);
    }

    /**
     * Parses {@code queryStr} against the {@code title} field, runs it, and
     * prints the {@code filename} value of every hit followed by the hit count.
     *
     * @param indexDir directory containing the Lucene index files
     * @param queryStr query text in Lucene query-parser syntax
     * @param maxHits  maximum number of hits to retrieve
     * @throws Exception if the index cannot be read or the query is malformed
     */
    private void searchIndex(File indexDir, String queryStr, int maxHits)
            throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        Directory directory = FSDirectory.open(indexDir);
        try {
            // A searcher built from a Directory owns the reader it opens, so
            // closing the searcher releases that reader as well.
            IndexSearcher searcher = new IndexSearcher(directory);
            try {
                Query query = new QueryParser(Version.LUCENE_36, "title", analyzer).parse(queryStr);
                TopDocs topDocs = searcher.search(query, maxHits);
                ScoreDoc[] hits = topDocs.scoreDocs;
                for (int i = 0; i < hits.length; i++) {
                    int docId = hits[i].doc;
                    Document d = searcher.doc(docId);
                    // NOTE(review): the schema posted alongside this code lists no
                    // "filename" field; if that list is complete this prints "null"
                    // for every hit. Confirm the intended stored field.
                    System.out.println(d.get("filename"));
                }
                System.out.println("Found " + hits.length);
            } finally {
                searcher.close();
            }
        } finally {
            directory.close();
        }
    }
}
我做错了什么?查看solrconfig.xml后,我无法确定solr默认使用的是哪个分析器,所以才分别尝试了SimpleAnalyzer和StandardAnalyzer。
如果能就如何调试这个问题给出建议,我将不胜感激。
更新:以下是我的架构中的字段:
<!-- Untokenized "string" key field; the only field marked required. -->
<field name="metaDataUrl" type="string" indexed="true" stored="true" required="true"/>
<!-- Fields of the analyzed "text" type, both indexed and stored. "title" is the field the Java searchers query. -->
<field name="title" type="text" stored="true" indexed="true"/>
<field name="snippet" type="text" indexed="true" stored="true"/>
<!-- Stored but not indexed, and multi-valued: a document may carry several "rest" values. -->
<field name="rest" type="string" stored="true" indexed="false" multiValued="true"/>
<!-- Date field that defaults to NOW when no value is supplied. -->
<field name="date_indexed" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<!-- Indexed but not stored "text" field; presumably a catch-all search field (confirm against copyField rules, which are not shown here). -->
<field name="all" type="text" stored="false" indexed="true" multiValued="true"/>
另外,这是schema.xml中名为text的fieldType的XML定义:
<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<!-- Index-time chain: whitespace tokenizer, stop word removal, word delimiter
     (with catenation of word and number parts), lowercasing, then English
     Snowball stemming. Terms therefore reach the index lowercased and stemmed. -->
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
add enablePositionIncrements=true in both the index and query
analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<!-- English stemmer; protwords.txt presumably lists words exempt from stemming (confirm against the Solr documentation). -->
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
<!-- Query-time chain: same tokenizer and filters as index time, with two
     differences: synonyms from synonyms.txt are expanded first, and the word
     delimiter filter has catenateWords and catenateNumbers set to 0. -->
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
答案 0 :(得分:1)
您需要使用索引时所用的分词器和过滤器(如fieldType xml的index部分中所定义)构建一个自定义分析器。将该自定义分析器作为参数传递给搜索器,然后搜索应该就可以正常工作了。 SnowballPorterFilter会对“science”进行词干提取吗?有可能......
有关构建自定义分析器的详细信息,请参阅http://whiteboardjunkie.wordpress.com/tag/custom-analyzer/。您只需在tokenStream()中依次调用各个过滤器。
此外,您可以使用luke(http://code.google.com/p/luke/)检查索引,查看是否有文档的title字段中包含“science”。
答案 1 :(得分:-1)
一位同事对我的代码稍作修改,修改后的代码如下。他还建议我用词干进行搜索。这种方法有效,现在针对这个由solr构建的Lucene索引进行搜索时,我可以得到结果了。这段代码仍有待完善,但我把它作为概念验证发布出来,希望对其他人有用。
import java.io.File;
import java.util.List;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Proof-of-concept command-line searcher that runs the query given as the
 * first program argument against the {@code title} field of an on-disk
 * Lucene index and dumps every stored field of each hit.
 */
public class SimpleSearcher {

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the query text in Lucene query-parser syntax
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 1) {
            System.err.println("Usage: SimpleSearcher <query>");
            System.exit(1);
        }
        File indexDir = new File("/path/to/solr/data/index/");
        int hits = 100;
        SimpleSearcher searcher = new SimpleSearcher();
        searcher.searchIndex(indexDir, args[0], hits);
    }

    /**
     * Parses {@code queryStr} against the {@code title} field, runs it, and
     * prints the name and value of every stored field of each hit, followed
     * by the hit count.
     *
     * @param indexDir directory containing the Lucene index files
     * @param queryStr query text in Lucene query-parser syntax
     * @param maxHits  maximum number of hits to retrieve
     * @throws Exception if the index cannot be read or the query is malformed
     */
    private void searchIndex(File indexDir, String queryStr, int maxHits)
            throws Exception {
        Directory directory = FSDirectory.open(indexDir);
        try {
            // A searcher built from a Directory owns the reader it opens, so
            // closing the searcher releases that reader as well.
            IndexSearcher searcher = new IndexSearcher(directory);
            try {
                // Pass the parser's match version to the analyzer as well; the
                // no-argument SimpleAnalyzer constructor is deprecated in 3.x.
                QueryParser parser = new QueryParser(Version.LUCENE_35,
                        "title", new SimpleAnalyzer(Version.LUCENE_35));
                Query query = parser.parse(queryStr);
                TopDocs topDocs = searcher.search(query, maxHits);
                ScoreDoc[] hits = topDocs.scoreDocs;
                for (int i = 0; i < hits.length; i++) {
                    int docId = hits[i].doc;
                    Document d = searcher.doc(docId);
                    List<Fieldable> fields = d.getFields();
                    System.out.println( (i+1) + ". ==========================================================");
                    for ( Fieldable field : fields ) {
                        if (field.isStored()) {
                            // Use this field instance's own value: Document.get(name)
                            // returns only the first value for a name, which would
                            // repeat that value for every entry of a multi-valued field.
                            System.out.println(" >> " + field.name() + " - " + field.stringValue());
                        }
                    }
                }
                System.out.println("Found " + hits.length);
            } finally {
                searcher.close();
            }
        } finally {
            directory.close();
        }
    }
}