我想在 Java 中构建一个倒排索引。我有 1400 个文本文件的数据。我已经能够计算每个词项(单词)的频率,也能够返回每个单词在整个集合中出现的次数,但无法返回该单词所出现的文档。这是我到目前为止的代码:
我希望以如下形式输出:term1:doc1:2,doc2:3 term2:doc1:3,doc4:1 ……
其中,term 是文档中的单词,doc1:2 表示 term1 在 doc1 中出现了 2 次。
/**
 * Counts word occurrences separately for each input file and prints the
 * per-file result.
 *
 * <p>For each file {@code D:\logs\steem<i>.txt} (i = 1..2) the whitespace-delimited
 * tokens are tallied, the number of distinct words and the full word-to-count map
 * are printed, and the map is emptied before the next file is read. No mapping
 * from a word to the documents containing it is kept.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException if one of the input files cannot be opened
 */
public static void main(String[]args) throws FileNotFoundException{
    // One map is reused for every file; it is cleared after each file is reported.
    Map<String, Integer> frequencies = new HashMap<>();
    for (int i = 1; i <= 2; i++) {
        File input = new File("D:\\logs\\steem" + i + ".txt");
        Scanner scanner = new Scanner(input);
        while (scanner.hasNext()) {
            String token = scanner.next();
            Integer seen = frequencies.get(token);
            if (seen == null) {
                // First occurrence of this token in the current file.
                frequencies.put(token, 1);
            } else {
                frequencies.put(token, seen + 1);
            }
        }
        System.out.println(frequencies.size() + " distinct words" + " steem" + i);
        System.out.println("Doc" + i + "" + frequencies);
        frequencies.clear();
        scanner.close();
    }
}
}
答案 0 :(得分:0)
/**
 * Builds an inverted index over the input files and prints it.
 *
 * <p>Each file {@code D:\logs\steem<i>.txt} (i = 1..2) is read as whitespace-delimited
 * tokens and represented by a {@code Doc} named {@code doc<i>}. Every word is mapped
 * to the set of documents it occurs in. One line is printed per word in the form
 * {@code word:doc1:2, doc2:3}, where the number after a document name is the word's
 * count in that document.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException if one of the input files cannot be opened
 */
public static void main(String[]args) throws FileNotFoundException{
    // word -> documents in which the word occurs
    Map<String, Set<Doc>> wordDocMap = new HashMap<>();
    for (int i = 1; i <= 2; i++) {
        // try-with-resources closes the scanner even if scanning fails part-way.
        try (Scanner tdsc = new Scanner(new File("D:\\logs\\steem" + i + ".txt"))) {
            Doc document = new Doc("doc" + i);
            while (tdsc.hasNext()) {
                String word = tdsc.next();
                document.put(word);
                Set<Doc> documents = wordDocMap.get(word);
                if (documents == null) {
                    documents = new HashSet<>();
                    wordDocMap.put(word, documents);
                }
                documents.add(document);
            }
        }
    }

    StringBuilder builder = new StringBuilder();
    for (Map.Entry<String, Set<Doc>> entry : wordDocMap.entrySet()) {
        String word = entry.getKey();
        builder.append(word).append(':');
        // Emit the separator before every posting except the first, so no
        // trailing ", " (or stray space) has to be trimmed afterwards.
        String separator = "";
        for (Doc document : entry.getValue()) {
            builder.append(separator)
                   .append(document.getDocName())
                   .append(':')
                   .append(document.getCount(word));
            separator = ", ";
        }
        builder.append('\n');
    }
    System.out.println(builder);
}
/**
 * A named document together with the number of times each word was recorded for it.
 *
 * <p>{@code equals}/{@code hashCode} are not overridden, so instances placed in a
 * hash-based set are distinguished by identity.
 */
static class Doc {
    /** Name of the document, as passed to the constructor. */
    private final String docName;
    /** Word -> number of times {@link #put(String)} was called with that word. */
    private final Map<String, Integer> counts = new HashMap<>();

    /**
     * Creates an empty document.
     *
     * @param docName the name returned by {@link #getDocName()}
     */
    public Doc(String docName){
        this.docName = docName;
    }

    /**
     * Records one occurrence of {@code word} in this document.
     *
     * @param word the word that was seen
     */
    public void put(String word) {
        Integer freq = counts.get(word);
        counts.put(word, (freq == null) ? 1 : freq + 1);
    }

    /**
     * Returns how many times {@code word} has been recorded.
     *
     * @param word the word to look up
     * @return the recorded count, or 0 if the word was never recorded (never {@code null})
     */
    public Integer getCount(String word) {
        Integer freq = counts.get(word);
        // Report 0 instead of null so callers can unbox the result safely.
        return (freq == null) ? 0 : freq;
    }

    /**
     * Returns the document name.
     *
     * @return the name supplied at construction
     */
    public String getDocName() {
        return this.docName;
    }
}