我正在使用PyLucene来构建和搜索倒排文本索引。我编写了下面这个类(不必害怕Python代码,PyLucene暴露的函数与Java版本相同):
import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
class LuceneCtrl():
    """Build and query a Lucene full-text index stored in a directory.

    Every call opens its own ``IndexWriter`` or ``DirectoryReader`` and
    closes it again before returning.
    """

    def __init__(self, index_dir):
        """Start the JVM if necessary and set up the directory and analyzer.

        Args:
            index_dir: Path of the directory that holds the index.
        """
        # Start the JVM only once per process so that creating a second
        # controller does not call initVM (with vmargs) again.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper = SimpleFSDirectory(Paths.get(self.index_dir))
        # Wrap the standard analyzer with a limit of 5000 tokens.
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 5000)

    def index_documents(self, docs):
        """Add documents to the index and commit them.

        Args:
            docs: Iterable of ``(content, id)`` pairs. ``content`` is stored
                in the analyzed "content" field, ``id`` in the "id" field.
        """
        writer_config = IndexWriterConfig(self.analyzer)
        writer = IndexWriter(self.dir_wrapper, writer_config)
        try:
            for content, doc_id in docs:
                doc = Document()
                # TextField is the analyzed, stored field type; the original
                # referenced a non-existent ``self.TermsField`` attribute.
                doc.add(Field("content", content, TextField.TYPE_STORED))
                doc.add(Field("id", doc_id, StringField.TYPE_STORED))
                writer.addDocument(doc)
            writer.commit()
        finally:
            # Always close the writer, even when indexing fails part-way.
            writer.close()

    def query_index(self, query_terms, n_top=10):
        """Search the "content" field and print the stored fields of each hit.

        All terms are required (default operator is AND).

        Args:
            query_terms: Query string in Lucene classic query-parser syntax.
            n_top: Maximum number of hits to retrieve.
        """
        reader = DirectoryReader.open(self.dir_wrapper)
        try:
            searcher = IndexSearcher(reader)
            parser = QueryParser("content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)
            query = parser.parse(query_terms)
            for score_doc in searcher.search(query, n_top).scoreDocs:
                doc = searcher.doc(score_doc.doc)
                table = {
                    field.name(): field.stringValue()
                    for field in doc.getFields()
                }
                print(table)
        finally:
            # Close the reader even if parsing or searching raised.
            reader.close()
我是Lucene的新手,想知道每次运行index_documents和query_index函数时都重新创建写入器(writer)和读取器(reader)是否是最佳做法。难道不能把更多的对象保存在类里吗?我曾尝试将读取器和写入器保存为属性,但这会导致进程崩溃。
编辑:这是我最终使用的类
import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
class LuceneCtrl():
    """Build and query a Lucene index, keeping the writer and reader open.

    The ``IndexWriter`` is created once in the constructor. The
    ``DirectoryReader`` and ``IndexSearcher`` are created on the first query
    and replaced when the index has changed. Call :meth:`close` when the
    controller is no longer needed.
    """

    def __init__(self, index_dir):
        """Start the JVM if necessary and open the writer.

        Args:
            index_dir: Path of the directory that holds the index.
        """
        # Start the JVM only once per process so that creating a second
        # controller does not call initVM (with vmargs) again.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper_reader = SimpleFSDirectory(Paths.get(self.index_dir))
        self.dir_wrapper_writer = SimpleFSDirectory(Paths.get(self.index_dir))
        # Wrap the standard analyzer with a limit of 5000 tokens.
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 5000)
        # Reader and searcher are opened lazily by query_index.
        self.reader = None
        self.searcher = None
        self.writer_config = IndexWriterConfig(self.analyzer)
        self.writer = IndexWriter(self.dir_wrapper_writer, self.writer_config)

    def index_documents(self, docs):
        """Add documents to the index and commit them.

        Args:
            docs: Iterable of ``(text, id)`` pairs. ``text`` is stored in the
                analyzed "text" field, ``id`` in the "id" field.
        """
        for text, id_ in docs:
            doc = Document()
            doc.add(Field("text", text, TextField.TYPE_STORED))
            doc.add(Field("id", id_, StringField.TYPE_STORED))
            self.writer.addDocument(doc)
        self.writer.commit()

    def _refresh_searcher(self):
        """Open the reader on first use, or swap it if the index changed."""
        if self.reader is None:
            self.reader = DirectoryReader.open(self.dir_wrapper_reader)
            self.searcher = IndexSearcher(self.reader)
            return
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            old_reader = self.reader
            self.reader = new_reader
            self.searcher = IndexSearcher(new_reader)
            # openIfChanged does not close the previous reader; close it
            # here so readers do not pile up across refreshes.
            old_reader.close()

    def query_index(self, tokens, operator='AND', n_top=10):
        """Search the "text" field and return the top hits.

        Args:
            tokens: Query string in Lucene classic query-parser syntax.
            operator: 'and' or '+' (case-insensitive) requires all terms;
                any other value selects OR.
            n_top: Maximum number of hits to retrieve.

        Returns:
            The ``scoreDocs`` array of the search result.
        """
        self._refresh_searcher()
        parser = QueryParser("text", self.analyzer)
        if operator.lower() in ('and', '+'):
            parser.setDefaultOperator(QueryParser.Operator.AND)
        else:
            parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(tokens)
        return self.searcher.search(query, n_top).scoreDocs

    def close(self):
        """Close the reader (if one was opened) and the writer."""
        if self.reader is not None:
            self.reader.close()
            self.reader = None
            self.searcher = None
        self.writer.close()