我有一张MS Excel表格,包含以下栏目
title,cast,director,genre.
使用jxl库解析Excel工作表。索引工作正常,但是当我搜索时,我总是找到0次点击。我不知道我哪里出错了。代码如下:
import java.io.File;
import java.io.IOException;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class ExcelParser {
Directory index;
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
IndexWriterConfig c = new IndexWriterConfig(Version.LUCENE_31, analyzer);
public void parse(String filePath) throws IndexOutOfBoundsException,
BiffException, IOException {
index = FSDirectory.open(new File("d:\\index"));
Sheet contentSheet = Workbook.getWorkbook(new File(filePath)).getSheet(
0);
indexDocs(contentSheet);
}
void indexDocs(Sheet contentSheet) throws CorruptIndexException,
IOException {
String currentColumn = "";
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31,
analyzer);
IndexWriter writer = new IndexWriter(index, iwc);
for (int i = 0; i < contentSheet.getColumns(); i++) {
Cell[] xlCells = contentSheet.getColumn(i);
currentColumn = xlCells[0].getContents();
StringBuffer sb = new StringBuffer();
for (int j = 1; j < xlCells.length; j++) {
sb.append(xlCells[j].getContents() + " ");
}
addDoc(writer, sb.toString(), currentColumn);
}
writer.close();
}
void searcher(String querystr, String onField) throws ParseException,
CorruptIndexException, IOException {
IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(
"d:\\index")));
Query q = new QueryParser(Version.LUCENE_31, onField, analyzer)
.parse(querystr);
int hitsPerPage = 2;
TopScoreDocCollector collector = TopScoreDocCollector.create(
hitsPerPage, true);
searcher.search(q, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
System.out.println("Found " + hits.length + " hits.");
for (int i = 0; i < hits.length; ++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
System.out.println((i + 1) + ". " + d.get("title"));
}
searcher.close();
}
private static void addDoc(IndexWriter w, String value, String fieldName)
throws IOException {
Document doc = new Document();
doc.add(new Field(fieldName, value, Field.Store.YES,
Field.Index.ANALYZED));
w.addDocument(doc);
}
public static void main(String[] args) throws IndexOutOfBoundsException,
BiffException, IOException {
ExcelParser p = new ExcelParser();
p.parse("d:\\movieList.xls");
try {
p.searcher("the", "title");
} catch (ParseException e) {
e.printStackTrace();
}
}
}
答案 0 :(得分:2)
您正在搜索默认停止过滤器列表中的the
一词。
将Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
更改为
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31, new HashSet());
清除停用词列表。