This code computes the TF-IDF of the words in 40 text files in a folder named docs. Whenever I run the program I keep getting a NullPointerException, which I believe comes from the computeTermFrequencies method. I want it to print the top 5 words by TF-IDF for each file.
Any help would be greatly appreciated! Thanks!
import java.util.*;
import java.io.*;

public class KeywordExtractor {
    public static void main(String[] args) {
        String dir = args[0]; // name of directory with input files
        HashMap<String, Integer> dfs = readDocumentFrequencies("freqs.txt");
        for (int i = 1; i <= 40; i++) {
            String name = dir + "/" + i + ".txt";
            HashMap<String, Integer> tfs = computeTermFrequencies(name);
            HashMap<String, Double> tfidf = computeTFIDF(tfs, dfs, 40);
            System.out.println(i + ".txt");
            printTopKeywords(tfidf, 5);
            System.out.println();
        }
    }
    // method that takes a filename and returns a HashMap with the number of
    // times each word appears in the file
    public static HashMap<String, Integer> computeTermFrequencies(String filename) {
        HashMap<String, Integer> hm2 = new HashMap<String, Integer>();
        try {
            FileReader fr = new FileReader(filename);
            BufferedReader br = new BufferedReader(fr);
            String line = "";
            line = normalize(line);
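            // note: line is still "" at this point, so normalize() only ever sees
            // the empty string; the lines read below are never normalized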
            while ((line = br.readLine()) != null) {
                String[] words = line.split(" ");
                for (int i = 0; i < words.length; i++) {
                    String word = words[i];
                    if (hm2.containsKey(word)) {
                        int x = hm2.get(word);
                        x++;
                        hm2.put(word, x);
                    } else {
                        hm2.put(word, 1);
                    }
                } // end for
            } // end while
        } catch (IOException e) {
            e.printStackTrace(); // report the error rather than failing silently
        }
        return hm2;
    }
    // method to read the document-frequency file created in another class;
    // returns a HashMap from each word to its document frequency
    public static HashMap<String, Integer> readDocumentFrequencies(String filename) {
        HashMap<String, Integer> hm = new HashMap<String, Integer>();
        try {
            // read the file
            FileReader fr = new FileReader(filename);
            BufferedReader br = new BufferedReader(fr);
            // loop through the lines and put each word in the hashmap
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                String[] a = line.split(" ");
                String word = a[0];
                int number = Integer.parseInt(a[1]);
                // map the word to its document frequency
                hm.put(word, number);
            } // end for
        } catch (IOException e) {
            e.printStackTrace(); // report the error rather than failing silently
        }
        return hm;
    }
    public static HashMap<String, Double> computeTFIDF(HashMap<String, Integer> tfs,
            HashMap<String, Integer> dfs, double nDocs) {
        HashMap<String, Double> hm3 = new HashMap<String, Double>();
        for (String key : tfs.keySet()) {
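            // dfs.get(key) returns an Integer; if key is missing it is null,
            // and unboxing null on the next line throws a NullPointerException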
            double idf = Math.log(nDocs / dfs.get(key));
            double tf = tfs.get(key);
            hm3.put(key, tf * idf);
        }
        return hm3;
    }
    /**
     * This method prints the top K keywords by TF-IDF in descending order.
     */
    public static void printTopKeywords(HashMap<String, Double> tfidfs, int k) {
        ValueComparator vc = new ValueComparator(tfidfs);
        TreeMap<String, Double> sortedMap = new TreeMap<String, Double>(vc);
        sortedMap.putAll(tfidfs);
        int i = 0;
        for (Map.Entry<String, Double> entry : sortedMap.entrySet()) {
            String key = entry.getKey();
            Double value = entry.getValue();
            System.out.println(key + " " + value);
            i++;
            if (i >= k) {
                break;
            }
        }
    }

    public static String normalize(String word) {
        return word.replaceAll("[^a-zA-Z ']", "").toLowerCase();
    }
}
/*
* This class makes printTopKeywords work. Do not modify.
*/
class ValueComparator implements Comparator<String> {
    Map<String, Double> map;

    public ValueComparator(Map<String, Double> base) {
        this.map = base;
    }

    public int compare(String a, String b) {
        if (map.get(a) >= map.get(b)) {
            return -1;
        } else {
            return 1;
        } // returning 0 would merge keys
    }
}
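A likely culprit: normalize(line) is only ever applied to the empty string before the read loop, so the words stored in tfs keep their punctuation and capitalization and never match the normalized keys read from freqs.txt. dfs.get(key) then returns null, and unboxing that null in computeTFIDF is what throws the NullPointerException, not computeTermFrequencies itself. Below is a minimal sketch of the two methods with normalization moved inside the read loop and a null guard added; it assumes freqs.txt stores already-normalized words, one word-count pair per line.

public static HashMap<String, Integer> computeTermFrequencies(String filename) {
    HashMap<String, Integer> counts = new HashMap<String, Integer>();
    // try-with-resources closes the reader even if readLine throws
    try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
        String line;
        while ((line = br.readLine()) != null) {
            // normalize each line as it is read, so the keys match freqs.txt
            for (String word : normalize(line).split(" ")) {
                if (word.isEmpty()) continue; // split can produce empty tokens
                counts.put(word, counts.getOrDefault(word, 0) + 1);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return counts;
}

public static HashMap<String, Double> computeTFIDF(HashMap<String, Integer> tfs,
        HashMap<String, Integer> dfs, double nDocs) {
    HashMap<String, Double> tfidf = new HashMap<String, Double>();
    for (String key : tfs.keySet()) {
        Integer df = dfs.get(key); // Integer, not int, so a missing key stays null
        if (df == null || df == 0) {
            continue; // skip words with no document frequency instead of unboxing null
        }
        tfidf.put(key, tfs.get(key) * Math.log(nDocs / df));
    }
    return tfidf;
}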