在lucene

时间:2017-07-20 16:55:00

标签: solr lucene full-text-search pylucene

我正在使用 PyLucene 构建并搜索倒排文本索引。我写了下面这个类(不必担心这是 Python 代码,PyLucene 暴露的函数与 Java 版本相同):

import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser

class LuceneCtrl():
    """Thin wrapper around a on-disk Lucene index (via PyLucene).

    A fresh IndexWriter / DirectoryReader is opened for every call to
    ``index_documents`` / ``query_index`` and released before the call
    returns, even when an error occurs.
    """

    def __init__(self, index_dir):
        """Start the JVM if needed and prepare the directory and analyzer.

        Args:
            index_dir: Filesystem path of the index directory.
        """
        # Only one JVM can live in a process; calling initVM() again with
        # vmargs fails, so start it only when no VM is running yet.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper = SimpleFSDirectory(Paths.get(self.index_dir))
        # Analyze at most the first 5000 tokens of each field.
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 5000)

    def index_documents(self, docs):
        """Add documents to the index and commit them.

        Args:
            docs: Iterable of ``(content, id)`` pairs, both strings.

        If anything fails before the commit succeeds, the pending changes
        are rolled back and the exception is re-raised.
        """
        writer_config = IndexWriterConfig(self.analyzer)
        writer = IndexWriter(self.dir_wrapper, writer_config)
        try:
            for content, doc_id in docs:
                doc = Document()
                # The class has no ``TermsField`` attribute (the original
                # ``self.TermsField`` raised AttributeError); the analyzed,
                # stored text field type is TextField.TYPE_STORED.
                doc.add(Field("content", content, TextField.TYPE_STORED))
                doc.add(Field("id", doc_id, StringField.TYPE_STORED))
                writer.addDocument(doc)
            writer.commit()
        except Exception:
            # rollback() discards uncommitted changes and closes the writer,
            # which also releases the index write lock.
            writer.rollback()
            raise
        else:
            writer.close()

    def query_index(self, query_terms, n_top=10):
        """Run a query against the "content" field and print each hit.

        Args:
            query_terms: Query string in classic QueryParser syntax; terms
                are combined with AND by default.
            n_top: Maximum number of hits to print.
        """
        reader = DirectoryReader.open(self.dir_wrapper)
        try:
            searcher = IndexSearcher(reader)
            parser = QueryParser("content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)
            query = parser.parse(query_terms)
            scoreDocs = searcher.search(query, n_top).scoreDocs
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                table = {field.name(): field.stringValue()
                         for field in doc.getFields()}
                print(table)
        finally:
            # Always release the reader's file handles, even on a parse or
            # search error.
            reader.close()

我是 Lucene 的新手,想知道每次运行 index_documents 和 query_index 函数时都重新创建 writer 和 reader 是否是最佳做法。难道不能把更多信息保存在类里吗?我尝试过把 reader 和 writer 保存为属性,但这样会导致进程崩溃。

编辑:这是我最终使用的类

import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser

class LuceneCtrl():
    """Lucene index wrapper (via PyLucene) with a long-lived writer/reader.

    The IndexWriter is created once in ``__init__``; the reader and searcher
    are created lazily on the first query and refreshed when the index has
    changed. Call ``close()`` (or use the instance as a context manager)
    when done so the writer and reader are released.
    """

    def __init__(self, index_dir):
        """Start the JVM if needed and open the writer on ``index_dir``.

        Args:
            index_dir: Filesystem path of the index directory.
        """
        # Only one JVM can live in a process; calling initVM() again with
        # vmargs fails, so start it only when no VM is running yet.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper_reader = SimpleFSDirectory(Paths.get(self.index_dir))
        self.dir_wrapper_writer = SimpleFSDirectory(Paths.get(self.index_dir))
        # Analyze at most the first 5000 tokens of each field.
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 5000)
        # Reader and searcher are opened lazily by query_index().
        self.reader = None
        self.searcher = None
        self.writer_config = IndexWriterConfig(self.analyzer)
        self.writer = IndexWriter(self.dir_wrapper_writer, self.writer_config)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Release the reader and the writer; safe to call more than once.

        The instance must not be used for indexing or querying afterwards.
        """
        if self.reader is not None:
            self.reader.close()
            self.reader = None
            self.searcher = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None

    def index_documents(self, docs):
        """Add documents to the index and commit them.

        Args:
            docs: Iterable of ``(text, id)`` pairs, both strings.
        """
        for text, id_ in docs:
            doc = Document()
            doc.add(Field("text", text, TextField.TYPE_STORED))
            doc.add(Field("id", id_, StringField.TYPE_STORED))
            self.writer.addDocument(doc)
        self.writer.commit()

    def _refresh_searcher(self):
        """Open the reader on first use, or swap it if the index changed."""
        if self.reader is None:
            self.reader = DirectoryReader.open(self.dir_wrapper_reader)
            self.searcher = IndexSearcher(self.reader)
            return
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            stale_reader = self.reader
            self.reader = new_reader
            self.searcher = IndexSearcher(new_reader)
            # openIfChanged() does not close the previous reader; close it
            # here so its file handles are not leaked on every refresh.
            stale_reader.close()

    def query_index(self, tokens, operator='AND', n_top=10):
        """Search the "text" field and return the top hits.

        Args:
            tokens: Query string in classic QueryParser syntax.
            operator: Default operator between terms. ``'and'`` or ``'+'``
                (case-insensitive) selects AND; any other value selects OR.
            n_top: Maximum number of hits to return.

        Returns:
            The ``scoreDocs`` array of the search result.
        """
        self._refresh_searcher()
        parser = QueryParser("text", self.analyzer)
        if operator.lower() in ('and', '+'):
            parser.setDefaultOperator(QueryParser.Operator.AND)
        else:
            parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(tokens)
        return self.searcher.search(query, n_top).scoreDocs

0 个答案:

没有答案