import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import org.apache.commons.io.FileUtils;
public class Indexer {
    public static void main(String[] args) throws IOException {
        // HindiStemmerLight shl = new HindiStemmerLight(); // stemmer disabled; see the commented call below

        Scanner in1 = new Scanner(System.in);
        System.out.println();
        System.out.println("Enter the File Path");
        String path = in1.next();
        File folder = new File(path);
        File[] listOfFiles = folder.listFiles();

        // Load the stop-word list.
        ArrayList<String> stopwords = new ArrayList<String>();
        File files = new File("/home/gaurav/stop-words_hindi_1_hi.txt");
        String stopWordsFile = FileUtils.readFileToString(files);
        String[] stopWords = stopWordsFile.split("\\s+");
        for (String str : stopWords) {
            stopwords.add(str);
        }

        System.out.println();

        // First pass: read the contents of each file and collect every non-stop word.
        ArrayList<String> array = new ArrayList<String>();
        for (int i = 0; i < listOfFiles.length; i++) {
            File file = listOfFiles[i];
            if (file.isFile() && file.getName().endsWith(".txt")) {
                String content = FileUtils.readFileToString(file); // whole file held in memory
                String[] a = content.split("\\s+");
                for (String s : a) {
                    s = s.trim();
                    if (!stopwords.contains(s)) {
                        // shl.stem(s); // applying the Hindi stemmer on each word
                        // if (!array.contains(s)) // storing each word encountered into arraylist - array
                        array.add(s);
                    }
                }
            }
        }

        // Sort the files by name so the postings come out in a stable order.
        Arrays.sort(listOfFiles, new Comparator<File>() {
            @Override
            public int compare(File f1, File f2) {
                return f1.getName().compareTo(f2.getName());
            }
        });

        // Second pass: for every collected word, re-read every file and count its occurrences.
        Map<String, ArrayList<HashMap<String, Integer>>> words =
                new TreeMap<String, ArrayList<HashMap<String, Integer>>>();
        Collections.sort(array);
        for (int i = 0; i < array.size(); i++) {
            String s = array.get(i);
            ArrayList<HashMap<String, Integer>> postings = new ArrayList<HashMap<String, Integer>>();
            for (File newFile : listOfFiles) {
                HashMap<String, Integer> doc = new HashMap<String, Integer>();
                int count = 0;
                String docId = newFile.getName();
                String c = FileUtils.readFileToString(newFile); // every file re-read for every word
                String[] w = c.split("\\s+");
                for (String s1 : w) {
                    if (s.equals(s1)) {
                        count++;
                    }
                }
                if (count != 0) {
                    doc.put(docId, count);
                    postings.add(doc);
                }
            }
            words.put(s, postings);
        }

        // Write the finished index to output.txt, one word and its postings per line.
        PrintStream out = new PrintStream(new FileOutputStream("output.txt"));
        System.setOut(out);
        for (String name : words.keySet()) {
            String value = words.get(name).toString();
            System.out.println(name + " " + value);
        }
        out.close();
        in1.close();
    }
}
I have created an indexer in Java (shown above), but the problem is that it only performs well when the document corpus is small. When the corpus is 50,000 text files, it throws an error (OutOfMemoryError: Java heap space) and runs for a very long time. Please point out what needs to be done to reduce its complexity.
Answer 0 (score: 1)
Index in small batches; do not keep the whole dataset in memory.
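A minimal sketch of that batching idea, assuming the same whitespace tokenization and .txt-only filter as the code above; the class name BatchIndexer, the batch size of 500, and the index-part-N.txt file naming are all illustrative choices, not from the original:

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.*;

public class BatchIndexer {
    static final int BATCH_SIZE = 500; // illustrative; tune to the available heap

    public static void main(String[] args) throws FileNotFoundException {
        File[] files = new File(args[0]).listFiles(); // assumes args[0] is the corpus directory
        Arrays.sort(files, Comparator.comparing(File::getName));

        int batchNo = 0;
        for (int start = 0; start < files.length; start += BATCH_SIZE) {
            // Partial index for this batch only: word -> (docId -> count).
            Map<String, Map<String, Integer>> partial = new TreeMap<>();
            int end = Math.min(start + BATCH_SIZE, files.length);
            for (int i = start; i < end; i++) {
                File f = files[i];
                if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
                try (Scanner sc = new Scanner(f)) { // streams tokens; the file is never loaded whole
                    while (sc.hasNext()) {
                        partial.computeIfAbsent(sc.next(), k -> new HashMap<>())
                               .merge(f.getName(), 1, Integer::sum);
                    }
                }
            }
            // Flush this batch's partial index to disk so it can be garbage-collected.
            try (PrintWriter out = new PrintWriter("index-part-" + batchNo++ + ".txt")) {
                for (Map.Entry<String, Map<String, Integer>> e : partial.entrySet()) {
                    out.println(e.getKey() + " " + e.getValue());
                }
            }
        }
    }
}

Peak memory is now bounded by one batch rather than by the whole corpus; since each part file is written in sorted key order, the parts can afterwards be combined with a line-by-line k-way merge without ever holding the full index in memory.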
Answer 1 (score: 0)
There is no reason to read an entire file into memory. Scan it one word at a time. And there is certainly no reason to read the files twice.
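A single-pass sketch along those lines, again assuming whitespace tokenization (the class name OnePassIndexer is made up for illustration): each file is streamed word by word with a Scanner, so no file is ever loaded whole or read a second time.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;

public class OnePassIndexer {
    public static void main(String[] args) throws FileNotFoundException {
        File[] files = new File(args[0]).listFiles(); // assumes args[0] is the corpus directory
        Arrays.sort(files, Comparator.comparing(File::getName));

        // word -> (docId -> count), built in a single pass over the corpus.
        Map<String, Map<String, Integer>> index = new TreeMap<>();

        for (File f : files) {
            if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
            try (Scanner sc = new Scanner(f)) { // Scanner's default delimiter is whitespace
                while (sc.hasNext()) {
                    index.computeIfAbsent(sc.next(), k -> new HashMap<>())
                         .merge(f.getName(), 1, Integer::sum);
                }
            }
        }

        for (Map.Entry<String, Map<String, Integer>> e : index.entrySet()) {
            System.out.println(e.getKey() + " " + e.getValue());
        }
    }
}

The index map itself still grows with the vocabulary, which is where the batching suggested in the answer above comes in for very large corpora.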