我正在尝试在我的应用程序中实现luecene搜索引擎。
我正在使用lucene 5.4.1
我已经成功实现了lucene的wildequeries和普通查询。
但我主要关注的是使用正则表达式模式搜索文本文件中的特定文本。
Index Writer代码:
public IndexWriter generateIndex(String docsPath) throws IOException {
String indexPath = System.getProperty("java.io.tmpdir") +File.separator+"indexDirectory";
if (indexPath == null) {
throw new IOException("System property 'java.io.tmpdir' does not specify a tmp dir");
}
File tmpDir = new File(indexPath);
if (!tmpDir.exists()) {
boolean created = tmpDir.mkdirs();
if (!created) {
throw new IOException("Unable to create tmp dir " + tmpDir);
}
}
boolean create = true;
final Path docDir = Paths.get(docsPath);
if (!Files.isReadable(docDir)) {
System.out.println("Document directory '" + docDir.toAbsolutePath()
+ "' does not exist or is not readable, please check the path");
System.exit(1);
}
Date start = new Date();
try {
System.out.println("Indexing to directory '" + indexPath + "'...");
Directory dir = FSDirectory.open(Paths.get(indexPath));
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
if (create) {
iwc.setOpenMode(OpenMode.CREATE);
} else {
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
}
IndexWriter writer = new IndexWriter(dir, iwc);
indexDocs(writer, docDir);
setIndexWriter(writer);
Date end = new Date();
System.out.println(end.getTime() - start.getTime() + " total milliseconds");
writer.close();
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
return getIndexWriter();
}
static void indexDocs(final IndexWriter writer, Path path) throws IOException {
if (Files.isDirectory(path)) {
Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
try {
indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
} catch (IOException ignore) {
// don't index files that can't be read.
}
return FileVisitResult.CONTINUE;
}
});
} else {
indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
}
}
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
try (InputStream stream = Files.newInputStream(file)) {
Document doc = new Document();
Field pathField = new StringField("path", file.toString(), Field.Store.NO);
doc.add(pathField);
doc.add(new LongField("modified", lastModified, Field.Store.NO));
doc.add(new TextField("contents",
new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
System.out.println("adding " + file);
writer.addDocument(doc);
} else {
System.out.println("updating " + file);
writer.updateDocument(new Term("path", file.toString()), doc);
}
}
}
索引搜索代码:
public IndexReader searchExecutor(String index, String queryString, RegexCapabilities capability) throws Exception {
String field = "contents";
String queries = null;
boolean raw = false;
int hitsPerPage = Integer.MAX_VALUE;
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer();
BufferedReader in = null;
Query q = new RegexpQuery(new Term("text", queryString));
q = q.rewrite(reader);
RegexQuery query = new RegexQuery(new Term("\\s*(FIND|find)"));
if (capability != null)
query.setRegexImplementation(capability);
System.out.println("Searching for: " + query.toString(field));
searcher.search(query, null, 1000);
doSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);
//reader.close();
return reader;
}
public static void doSearch(BufferedReader in, IndexSearcher searcher, Query query, int hitsPerPage, boolean raw,
boolean interactive)
throws IOException {
TopDocs results = searcher.search(query, 5 * hitsPerPage);
ScoreDoc[] hits = results.scoreDocs;
//generateIndex.deleteDocuments(query);
//generateIndex.getDirectory();
// TermsEnum.totalTermFreq();
int numTotalHits = results.totalHits;
System.out.println(numTotalHits + " total matching documents");
int start = 0;
int end = Math.min(numTotalHits, hitsPerPage);
for (int i = start; i < end; i++) {
Document doc = searcher.doc(hits[i].doc);
String path = doc.get("path");
File file = new File(path);
if (path != null) {
System.out.println((i + 1) + ". " + path);
String title = doc.get("title");
if (title != null) {
System.out.println(" Title: " + doc.get("title"));
}
} else {
System.out.println((i + 1) + ". " + "No path for this document");
}
}
}
请帮忙。
答案 0 :(得分:1)
您的问题是关于在lucene中使用正则表达式进行搜索。
RegexQuery
,请尝试使用RegexpQuery
\s*
开头,但您不使用KeywordTokenizer。大多数其他标记器将删除(又名&#34;拆分为&#34;)空白- &GT;阅读Supported RegExp syntax和syntax in ES以及TestRegexpRandom (test class)并在您的索引上使用https://github.com/DmitryKey/luke。