我正在尝试使用 Lucene 6.2 为来自 MySQL 的数据建立索引(在 Scala 中通过 Slick 读取数据)。代码如下:
package oc.api.services
/**
* Created by sujit on 9/7/16.
*/
import org.apache.lucene.document._
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.index._
import org.apache.lucene.search.IndexSearcher
import java.io.{File, IOException}
import java.nio.file.Paths
import akka.actor.ActorSystem
import akka.event.{Logging, LoggingAdapter}
import akka.stream.ActorMaterializer
import oc.api.utils.{Config, DatabaseService}
import org.apache.lucene.analysis.core.KeywordAnalyzer
import org.apache.lucene.index.IndexWriterConfig.OpenMode
import org.apache.lucene.queryparser.classic.{MultiFieldQueryParser, QueryParser}
import org.apache.lucene.store.FSDirectory
import scala.concurrent.ExecutionContext
/**
 * Builds and queries a Lucene index of the notes returned by `NotesService`.
 *
 * The index lives in a fixed directory on disk; `setIndex` recreates it from
 * scratch and `search` runs a keyword query against it and prints the hits.
 */
class Indexer extends Config {
  implicit val actorSystem: ActorSystem = ActorSystem()
  implicit val executor: ExecutionContext = actorSystem.dispatcher
  implicit val log: LoggingAdapter = Logging(actorSystem, getClass)
  implicit val materializer: ActorMaterializer = ActorMaterializer()

  val databaseService = new DatabaseService(jdbcUrl, dbUser, dbPassword)
  val notesService = new NotesService(databaseService)

  // Single source of truth for the on-disk index location (used by both methods).
  private val indexStoreDir = Paths.get("/var/www/html/LuceneIndex")

  /**
   * Recreates the index (OpenMode.CREATE) from all notes returned by
   * `notesService.getNotes()`.
   *
   * Documents are added asynchronously once the notes are available and are
   * committed once at the end instead of once per document. The writer and the
   * directory are always closed when the indexing step finishes, successfully or
   * not, so the index write lock is released.
   *
   * @return the asynchronous indexing step; it completes after the final commit
   */
  def setIndex = {
    val writerConfig = new IndexWriterConfig(new StandardAnalyzer())
    writerConfig.setOpenMode(OpenMode.CREATE)
    writerConfig.setRAMBufferSizeMB(500)

    val directory = FSDirectory.open(indexStoreDir)
    val writer = new IndexWriter(directory, writerConfig)

    notesService.getNotes() // Gets all notes from Slick.
      .map { notes =>
        notes.foreach { note =>
          val doc = new Document()
          // NOTE(review): the "id" field is populated from note.title, exactly as in the
          // original code. This looks like a copy/paste slip - confirm which attribute
          // of the note should be stored here.
          doc.add(new TextField("id", note.title, Field.Store.YES))
          doc.add(new TextField("title", note.title, Field.Store.YES))
          doc.add(new TextField("teaser", note.teaser, Field.Store.YES))
          doc.add(new TextField("description", note.description, Field.Store.YES))
          writer.addDocument(doc)
        }
        // One commit for the whole batch; committing per document is needlessly slow.
        writer.commit()
      }
      .andThen { case _ =>
        // Runs on success and on failure, so the writer never stays open.
        try writer.close()
        finally directory.close()
      }
  }

  /**
   * Searches the "title", "teaser" and "description" fields for `keyword` and
   * prints up to 500 matching documents followed by the total hit count.
   *
   * @param keyword query text, parsed with the classic Lucene query syntax
   */
  def search(keyword: String): Unit = {
    val directory = FSDirectory.open(indexStoreDir)
    try {
      val reader = DirectoryReader.open(directory)
      try {
        val searcher = new IndexSearcher(reader)
        val fieldsToSearch = Array("title", "teaser", "description")
        val parser = new MultiFieldQueryParser(fieldsToSearch, new StandardAnalyzer())
        val hits = searcher.search(parser.parse(keyword), 500)

        hits.scoreDocs.foreach { scoreDoc =>
          val doc = searcher.doc(scoreDoc.doc)
          println("***** Document Found: ")
          println("***** Title: ")
          println(doc.get("title"))
          println("***** Teaser: ")
          println(doc.get("teaser"))
          println("***** Description: ")
          println(doc.get("description"))
        }
        println("****** Results Found: " + hits.totalHits)
      } finally {
        reader.close()
      }
    } finally {
      directory.close()
    }
  }
}
/**
 * Command-line entry point: creates an [[Indexer]] and runs a sample keyword
 * search against the index that already exists on disk.
 */
object Indexer extends App {
  val index = new Indexer()
  // Uncomment the next line to rebuild the index before searching.
  // index.setIndex
  index.search("Donec")
}
setIndex 函数按预期在指定的路径下创建了索引。但是当我按关键字搜索该索引时,却返回 0 条结果。search 函数有什么错误吗?该如何解决这个问题?
另外,如何改写上述代码,利用多线程(Thread)来优化索引的创建?
答案 0 :(得分:1)
经过很长时间的研究,我终于找到了答案:
使用线程:
<textarea>
使用Scala Future:
/**
 * Indexes all notes in parallel, using at most one `Index` worker per available
 * processor, all sharing a single `IndexWriter`.
 *
 * The notes are split into contiguous chunks of equal size (the last chunk may be
 * smaller) using ceiling division, so there are never more chunks than processors
 * and no chunk is empty. When there are no notes, nothing is opened or started.
 *
 * This method only starts the workers; it does not wait for them to finish.
 */
def setI: Unit = {
  val numThreads = Runtime.getRuntime.availableProcessors()
  // Await the notes once and reuse the result for both counting and chunking.
  val allNotes = Await.result(notesService.getNotes(), Duration.Inf)

  if (allNotes.nonEmpty) {
    // Ceiling division: plain integer division truncates, which made the chunk size 0
    // for small inputs and produced more chunks than worker slots for others.
    val chunkSize = (allNotes.length + numThreads - 1) / numThreads
    val chunks = allNotes.grouped(chunkSize).toVector

    val IndexStoreDir = Paths.get("/var/www/html/LuceneIndex")
    val writerConfig = new IndexWriterConfig(new StandardAnalyzer())
    writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND)
    writerConfig.setRAMBufferSizeMB(500)
      .setMaxBufferedDocs(10)
      .setMergeScheduler(new ConcurrentMergeScheduler())
    // NOTE(review): the writer is not committed or closed here; presumably the Index
    // workers (not visible in this snippet) or the caller take care of that - confirm.
    val writer = new IndexWriter(FSDirectory.open(IndexStoreDir), writerConfig)

    // Exactly one worker per chunk, so every worker slot is populated before start().
    val threads = chunks.zipWithIndex.map { case (chunk, i) => new Index(chunk, writer, i) }

    threads.zipWithIndex.foreach { case (thread, i) =>
      println("Thread :" + thread.getName + " => " + (i + 1) + " Started!")
      thread.start()
    }
  }
}