Question

我正在为信息检索课程创建一个倒排索引，但无法弄清楚如何查看单词是否在我的嵌套哈希图中。

“内部”包含单词及其频率，而“ invertedIndex”包含其出现的文档的名称。

在处理搜索时，我试图查看定义为“查询”的用户输入是否在内部哈希图中。我很确定错误是由于代码底部的嵌套for循环引起的。

我的代码在下面。

   public class PositionalIndex extends Stemmer{

  // no more than this many input files needs to be processed
  final static int MAX_NUMBER_OF_INPUT_FILES = 100;

  // an array to hold Gutenberg corpus file names
  static String[] inputFileNames = new String[MAX_NUMBER_OF_INPUT_FILES];

  static int fileCount = 0;

  // loads all files names in the directory subtree into an array
  // violates good programming practice by accessing a global variable (inputFileNames)
  public static void listFilesInPath(final File path) {
      for (final File fileEntry : path.listFiles()) {
          if (fileEntry.isDirectory()) {
              listFilesInPath(fileEntry);
          }
          else if (fileEntry.getName().endsWith((".txt")))  {
              inputFileNames[fileCount++] = fileEntry.getPath();
          }
      }
      System.out.println("File count: " + fileCount);
  }

  public static void main(String[] args){

    // did the user provide correct number of command line arguments?
    // if not, print message and exit
    if (args.length != 1){
        System.err.println("Number of command line arguments must be 1");
        System.err.println("You have given " + args.length + " command line arguments");
        System.err.println("Incorrect usage. Program terminated");
        System.err.println("Correct usage: java Ngrams <path-to-input-files>");
        System.exit(1);
    }

    // extract input file name from command line arguments
    // this is the name of the file from the Gutenberg corpus
    String inputFileDirName = args[0];

    System.out.println("Input files directory path name is: " + inputFileDirName);

    // collects file names and write them to
    listFilesInPath(new File (inputFileDirName));

    // wordPattern specifies pattern for words using a regular expression
    Pattern wordPattern = Pattern.compile("[a-zA-Z]+");

    // wordMatcher finds words by spotting word word patterns with input
    Matcher wordMatcher;

    // a line read from file
    String line;

    // br for efficiently reading characters from an input stream
    BufferedReader br = null;

    // an extracted word from a line
    String word;

    // simplified version of porterStemmer
    Stemmer porterStemmer = new Stemmer();

    System.out.println("Processing files...");

    // create an instance of the Stemmer class
    Stemmer stemmer = new Stemmer();

    Map<String, Map<String, Integer>> invertedIndex = new HashMap<String, Map<String, Integer>>();
    Map<String, Integer> inner = new HashMap<String, Integer>();

    // process one file at a time
    for (int index = 0; index < fileCount; index++){

        // open the input file, read one line at a time, extract words
        // in the line, extract characters in a word, write words and
        // character counts to disk files
        try {
            // get a BufferedReader object, which encapsulates
            // access to a (disk) file
            br = new BufferedReader(new FileReader(inputFileNames[index]));

            // as long as we have more lines to process, read a line
            // the following line is doing two things: makes an assignment
            // and serves as a boolean expression for while test
            while ((line = br.readLine()) != null) {
                // process the line by extracting words using the wordPattern
                wordMatcher = wordPattern.matcher(line);

                // process one word at a time
                while ( wordMatcher.find() ) {
                    // extract the word
                    word = line.substring(wordMatcher.start(), wordMatcher.end());
                    word = word.toLowerCase();

                    //use Stemmer class to stem word & convert to lowercase
                    porterStemmer.stemWord(word);
                    if (!inner.containsKey(word)) {
                        inner.put(word, 1);
                    }
                    else
                    {
                      inner.put(word, inner.get(word) + 1);
                    }
              } // end one word at a time while
            } // end outer while
            invertedIndex.put(inputFileNames[index], inner);
            /*for(String x : inner.keySet()) {
                System.out.println(x);
              }*/
            inner.clear();
          } // end try
              catch (IOException ex) {
                System.err.println("File " + inputFileNames[index] + " not found. Program terminated.\n");
                System.exit(1);
                }
              } // end for
        System.out.print("Enter a query: ");
        Scanner kbd = new Scanner(System.in);
        String query = kbd.next();

        for(String fileName : invertedIndex.keySet()) {
            for(String wordInFile : invertedIndex.get(fileName).keySet())
            {
              if(wordInFile.equals(query))
              {
                System.out.println(query + " was found in document " + fileName);
              }
            }
          }


  }
}

Answer 1

尝试

for(String w : invertedIndex.keySet()) {
   Map<String, Integer> fileWordMap =  invertedIndex.get(w)
   if(fileWordMap.containsKey(query))
   {
       System.out.println(query + " was found in document " + w);
   }
}

或按照您的原始代码

for(String fileName : invertedIndex.keySet()) {
   for(String wordInFile : invertedIndex.get(fileName).keySet())
   {
       if(wordInFile.equals(query))
       {
          System.out.println(query + " was found in document " + fileName);
       }
    }
}

作为一个提示，尝试使用变量名可以告诉您代码在做什么：)如果仅使用随机变量名，则很容易混淆

Answer 2

您为什么调用：

inner.clear()

似乎每次都需要创建一个新的内部映射，然后将其添加到反向索引；而不是在数据丢失时清除它。

如何访问嵌套哈希图的关键内容

2 个答案: