无法在lucene中获取搜索文本

时间:2013-10-03 12:09:50

标签: lucene

我特此粘贴以下代码,

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
            import org.apache.lucene.document.Document;
            import org.apache.lucene.document.Field;
            import org.apache.lucene.document.StringField; 
            import org.apache.lucene.index.DirectoryReader;
            import org.apache.lucene.index.IndexReader;
            import org.apache.lucene.index.IndexWriter;
            import org.apache.lucene.index.IndexWriterConfig;
            import org.apache.lucene.queryparser.classic.QueryParser;
            import org.apache.lucene.search.IndexSearcher;
            import org.apache.lucene.search.Query;
            import org.apache.lucene.search.ScoreDoc;
            import org.apache.lucene.search.TopScoreDocCollector;
            import org.apache.lucene.store.FSDirectory;
            import org.apache.lucene.util.Version;

            import java.io.*;
            import java.util.ArrayList;

            /**
             * This terminal application creates an Apache Lucene index in a folder and adds files into this index
             * based on the input of the user.
             */
            public class TextFileIndexer {
              private static StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
              private Analyzer anal = new WhitespaceAnalyzer(Version.LUCENE_44);
              private IndexWriter writer;
              private ArrayList<File> queue = new ArrayList<File>();


              public static void main(String[] args) throws IOException {
                System.out.println("Enter the path where the index will be created: (e.g. /tmp/index or c:/temp/index)");

                String indexLocation = null;
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(System.in));
                String s = br.readLine();

                TextFileIndexer indexer = null;
                try {
                  indexLocation = s;
                  indexer = new TextFileIndexer(s);
                } catch (Exception ex) {
                  System.out.println("Cannot create index..." + ex.getMessage());
                  System.exit(-1);
                }

                //===================================================
                //read input from user until he enters q for quit
                //===================================================
                while (!s.equalsIgnoreCase("q")) {
                  try {
                    System.out.println("Enter the full path to add into the index (q=quit): (e.g. /home/ron/mydir or c:\\Users\\ron\\mydir)");
                    System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
                    s = br.readLine();
                    if (s.equalsIgnoreCase("q")) {
                      break;
                    }

                    //try to add file into the index
                    indexer.indexFileOrDirectory(s);
                  } catch (Exception e) {
                    System.out.println("Error indexing " + s + " : " + e.getMessage());
                  }
                }

                //===================================================
                //after adding, we always have to call the
                //closeIndex, otherwise the index is not created    
                //===================================================
                indexer.closeIndex();

                //=========================================================
                // Now search
                //=========================================================
                IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
                IndexSearcher searcher = new IndexSearcher(reader);
                TopScoreDocCollector collector = TopScoreDocCollector.create(5, true);

                s = "";
                while (!s.equalsIgnoreCase("q")) {
                  try {
                    System.out.println("Enter the search query (q=quit):");
                    s = br.readLine();
                    if (s.equalsIgnoreCase("q")) {
                      break;
                    }
                    Query q = new QueryParser(Version.LUCENE_44, "contents", analyzer).parse(s);
                    searcher.search(q, collector);
                    ScoreDoc[] hits = collector.topDocs().scoreDocs;

                    // 4. display results
                    System.out.println("Found " + hits.length + " hits.");
                    for(int i=0;i<hits.length;++i) {
                      int docId = hits[i].doc;
                      Document d = searcher.doc(docId);
                      System.out.println((i + 1) + ". " + d.get("path") + " score=" + hits[i].score);
                    }

                  } catch (Exception e) {
                    System.out.println("Error searching " + s + " : " + e.getMessage());
                  }
                }

              }

              /**
               * Constructor
               * @param indexDir the name of the folder in which the index should be created
               * @throws java.io.IOException when exception creating index.
               */
              TextFileIndexer(String indexDir) throws IOException {
                // the boolean true parameter means to create a new index everytime, 
                // potentially overwriting any existing files there.
                FSDirectory dir = FSDirectory.open(new File(indexDir));


                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);

                writer = new IndexWriter(dir, config);
              }

              /**
               * Indexes a file or directory
               * @param fileName the name of a text file or a folder we wish to add to the index
               * @throws java.io.IOException when exception
               */
              public void indexFileOrDirectory(String fileName) throws IOException {
                //===================================================
                //gets the list of files in a folder (if user has submitted
                //the name of a folder) or gets a single file name (is user
                //has submitted only the file name) 
                //===================================================
                addFiles(new File(fileName));

                int originalNumDocs = writer.numDocs();
                for (File f : queue) {
                  FileReader fr = null;
                  try {
                    Document doc = new Document();

                    //===================================================
                    // add contents of file
                    //===================================================
                    fr = new FileReader(f);

            //        doc.add(new TextField("contents", fr));
                    doc.add(new StringField("path", f.getPath(), Field.Store.YES));
                    doc.add(new StringField("filename", f.getName(), Field.Store.YES));

                    writer.addDocument(doc);
                    System.out.println("Added: " + f);



                    BufferedReader br = new BufferedReader(new FileReader(fileName));
                    Field field = new StringField("contents", br.readLine().toString(),
                            Field.Store.YES);
                    doc.add(field);
                    writer.addDocument(doc);


                  } catch (Exception e) {
                    System.out.println("Could not add: " + f);
                  } finally {
                    fr.close();
                  }
                }

                int newNumDocs = writer.numDocs();
                System.out.println("");
                System.out.println("************************");
                System.out.println((newNumDocs - originalNumDocs) + " documents added.");
                System.out.println("************************");

                queue.clear();
              }

              private void addFiles(File file) {

                if (!file.exists()) {
                  System.out.println(file + " does not exist.");
                }
                if (file.isDirectory()) {
                  for (File f : file.listFiles()) {
                    addFiles(f);
                  }
                } else {
                  String filename = file.getName().toLowerCase();
                  //===================================================
                  // Only index text files
                  //===================================================
                  if (filename.endsWith(".htm") || filename.endsWith(".html") || 
                          filename.endsWith(".xml") || filename.endsWith(".txt")  || filename.endsWith(".pdf") ) {
                    queue.add(file);
                  } else {
                    System.out.println("Skipped " + filename);
                  }
                }
              }

              /**
               * Close the index.
               * @throws java.io.IOException when exception closing
               */
              public void closeIndex() throws IOException {
                writer.close();
              }
            }

但是,当我在文件中搜索特定的字符串时。我找不到String。输出如下,

Enter the path where the index will be created: (e.g. /tmp/index or c:/temp/index)
D:/svn/phase2/JavaSource/test/test/
Enter the full path to add into the index (q=quit): (e.g. /home/ron/mydir or c:\Users\ron\mydir)
[Acceptable file types: .xml, .html, .html, .txt]
D:/svn/phase2/JavaSource/test/test
Skipped segments.gen
Skipped segments_1
Skipped write.lock
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\demo.xml
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\exe.xml
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\Fruit.XML
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\Influence_People.pdf
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\new.html
Added fileName : D:/svn/phase2/JavaSource/test/test
Added: D:\svn\phase2\JavaSource\test\test\Toy.xml

************************
6 documents added.
************************
Enter the full path to add into the index (q=quit): (e.g. /home/ron/mydir or c:\Users\ron\mydir)
[Acceptable file types: .xml, .html, .html, .txt]
q
Enter the search query (q=quit):
for
Entered String is   :  for
fieldName =for
Found : 0 hits.
Enter the search query (q=quit):
i
Entered String is   :  i
Error searching i : this IndexReader is closed
Enter the search query (q=quit):
q
Entered String is   :  q

2 个答案:

答案 0 :(得分:0)

“for”和“i”默认都是StandardAnalyzer中的停用词,因此无法进行搜索。默认停用词的完整列表是:

   "a", "an", "and", "are", "as", "at", "be", "but", "by",
   "for", "if", "in", "into", "is", "it",
   "no", "not", "of", "on", "or", "such",
   "that", "the", "their", "then", "there", "these",
   "they", "this", "to", "was", "will", "with"

似乎可能还有其他问题在起作用。不知道为什么你的读者会因第二次查询而被关闭。我不知道输出“fieldName = for”来自哪里。但希望这会让你开始调试。

答案 1 :(得分:0)

您是否尝试过在Luke中调试代码? (Lucene Index Toolbox) http://code.google.com/p/luke/

Luke非常善于使用不同的分析器执行搜索,检查索引存储,了解基于搜索等方式对文档进行评分。它可以帮助消除搜索代码的任何问题,因为它可以直接处理索引文件。

Luke适用于Lucene的Java和.NET版本。