Java Indexer Speed

Date: 2016-02-09 06:24:24

Tags: java time-complexity information-retrieval indexer

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import org.apache.commons.io.FileUtils;

public class Indexer {

    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws IOException{

        //HindiStemmerLight shl = new HindiStemmerLight();                     // Hindi stemmer (commented out; only referenced in commented code below)
        Scanner in1 = new Scanner(System.in);
        System.out.println("");
        System.out.println("Enter the File Path");

        String path = in1.next();


        File folder = new File(path);                               
        File[] listOfFiles = folder.listFiles();
        ArrayList<String> array = new ArrayList<String>();
        int count1 = 0 ;
        ArrayList<String> stopwords = new ArrayList<String>();
        File files = new File("/home/gaurav/stop-words_hindi_1_hi.txt");
        String stopWordsFile=FileUtils.readFileToString(files);
        String[] stopWords = stopWordsFile.split(" ");
        for(String str:stopWords){
            stopwords.add(str);
        }
        System.out.println("");

       for (int i = 0; i < listOfFiles.length; i++) {                          // Reading the contents of each file

              File file = listOfFiles[i];

              if (file.isFile() && file.getName().endsWith(".txt")) {
                String content = FileUtils.readFileToString(file);             // storing the contents of the file in content

                String[] a = content.split(" ");                               // split the content into words
                for(String s : a){
                    s = s.trim();
                    if(!stopwords.contains(s)){                                // skip stop words
                        //shl.stem(s);                                         // applying the hindi stemmer on each word
                        //if(!array.contains(s))                               // storing each word encountered into arraylist - array
                        array.add(s);                                          // note: duplicates are kept, so array grows with every word in the corpus
                    }
                }
              }
        }

       Arrays.sort(listOfFiles, new Comparator<File>()
       {
           @Override
           public int compare(File f1, File f2) {
               return f1.getName().compareTo(f2.getName());
           }
       });


       Map<String, ArrayList<HashMap<String, Integer>>> words = new TreeMap<String, ArrayList<HashMap<String, Integer>>>();
       Collections.sort(array);
        for(int i = 0; i < array.size(); i++){                                 // for every word collected above...
            String s = array.get(i);
            ArrayList<HashMap<String, Integer>> postings = new ArrayList<HashMap<String, Integer>>();

            for(File newFile : listOfFiles){                                   // ...every file is read again from disk
                HashMap<String, Integer> doc = new HashMap<String, Integer>();
                int count = 0;
                String docId = newFile.getName();
                String c = FileUtils.readFileToString(newFile);
                String[] w = c.split(" ");
                for(String s1 : w){
                    if(s.equals(s1)){
                        count++;
                    }
                }
                if(count != 0){
                    doc.put(docId, count);
                    postings.add(doc);
                }
            }
            words.put(s, postings);
        }
        PrintStream out = new PrintStream(new FileOutputStream("output.txt"));
        System.setOut(out);                                                    // redirect stdout to output.txt
        for (String name : words.keySet()){
            String value = words.get(name).toString();
            System.out.println(name + " " + value);                            // word -> list of {docId=count}
        }
    }
}

I have created an indexer in Java. The problem is that it performs well when the document corpus is small, but when the corpus grows to 50,000 text files it throws an error (OutOfMemoryError: Java heap space) and runs for a very long time. Please suggest what needs to be done to reduce its complexity.

2 answers:

Answer 0: (score: 1)

Index in small batches, and don't keep the whole dataset in memory.
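
A minimal sketch of that idea, assuming the same directory-of-.txt-files layout as in the question. The BatchIndexer class name, the batchSize of 1000, and the partial-*.txt output names are illustrative choices, not part of any library:

    import java.io.*;
    import java.util.*;

    public class BatchIndexer {
        public static void main(String[] args) throws IOException {
            File[] files = new File(args[0]).listFiles();
            Arrays.sort(files, new Comparator<File>() {
                public int compare(File a, File b) { return a.getName().compareTo(b.getName()); }
            });

            Map<String, Map<String, Integer>> index = new TreeMap<String, Map<String, Integer>>();
            int batchSize = 1000;                            // illustrative; tune to the available heap
            int batchNo = 0;

            for (int i = 0; i < files.length; i++) {
                File f = files[i];
                if (!f.isFile() || !f.getName().endsWith(".txt")) continue;

                Scanner sc = new Scanner(f);                 // streams the file one token at a time
                while (sc.hasNext()) {
                    String w = sc.next().trim();
                    Map<String, Integer> postings = index.get(w);
                    if (postings == null) {
                        postings = new HashMap<String, Integer>();
                        index.put(w, postings);
                    }
                    Integer c = postings.get(f.getName());
                    postings.put(f.getName(), c == null ? 1 : c + 1);
                }
                sc.close();

                if ((i + 1) % batchSize == 0 || i == files.length - 1) {
                    flush(index, "partial-" + (batchNo++) + ".txt");   // write partial index, then free the heap
                    index.clear();
                }
            }
        }

        static void flush(Map<String, Map<String, Integer>> index, String name) throws IOException {
            PrintWriter out = new PrintWriter(new FileWriter(name));
            for (Map.Entry<String, Map<String, Integer>> e : index.entrySet()) {
                out.println(e.getKey() + " " + e.getValue());          // word -> {docId=count, ...}
            }
            out.close();
        }
    }

Because the in-memory map is a TreeMap, each partial-*.txt file comes out sorted by term, so the partial files can afterwards be merged line by line without ever holding more than one batch in memory.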

Answer 1: (score: 0)

There is no reason to read an entire file into memory; scan it one word at a time. And there is certainly no reason to read the files twice.
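
For illustration, a single-pass rewrite of the counting step under those constraints. OnePassIndexer and its index method are hypothetical names, and a Set (e.g. a HashSet) is assumed for the stop words so each lookup is O(1) instead of the linear ArrayList.contains scan in the question:

    import java.io.File;
    import java.io.IOException;
    import java.util.*;

    public class OnePassIndexer {
        // Builds word -> (docId -> count) in a single pass over each file.
        static Map<String, Map<String, Integer>> index(File[] files, Set<String> stopwords)
                throws IOException {
            Map<String, Map<String, Integer>> index = new TreeMap<String, Map<String, Integer>>();
            for (File f : files) {
                if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
                Scanner sc = new Scanner(f);                  // streams one token at a time
                while (sc.hasNext()) {
                    String w = sc.next().trim();
                    if (stopwords.contains(w)) continue;      // filter and count in the same pass
                    Map<String, Integer> postings = index.get(w);
                    if (postings == null) {
                        postings = new HashMap<String, Integer>();
                        index.put(w, postings);
                    }
                    Integer c = postings.get(f.getName());
                    postings.put(f.getName(), c == null ? 1 : c + 1);
                }
                sc.close();
            }
            return index;
        }
    }

Compared with the question's code, each file is opened exactly once and the heap only ever holds the index itself, never a full file's contents or a duplicate-laden word list.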