// For every input file abc<a>.txt, print word count, total token count,
// term frequency, IDF and TF/IDF for each query word in array2.
// NOTE(review): `filename` is used as the upper bound of the file index,
// so it is presumably the number of files -- confirm against the caller.
for (a = 0; a < filename; a++) {
    try {
        System.out
                .println(" _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ");
        System.out.println("\n");
        System.out.println("The word inputted : " + word2);
        File file = new File(
                "C:\\Users\\user\\fypworkspace\\TextRenderer\\abc" + a
                        + ".txt");
        System.out.println(" _________________");
        System.out.print("| File = abc" + a + ".txt | \t\t \n");

        for (int i = 0; i < array2.length; i++) {
            totalCount = 0;
            wordCount = 0;

            // The file is re-read once per query word. try-with-resources
            // guarantees the Scanner (and its file handle) is closed even
            // if reading fails; the original never closed it.
            try (Scanner tokens = new Scanner(file)) {
                while (tokens.hasNext()) {
                    totalCount++;
                    if (tokens.next().equals(array2[i])) {
                        wordCount++;
                    }
                }
            }

            // Term frequency = occurrences of the word / all tokens in the file.
            // An empty file yields NaN here, exactly as before.
            double termFrequency = (double) wordCount / totalCount;

            System.out.print(array2[i] + " --> Word count = "
                    + "\t " + "|" + wordCount + "|");
            System.out.print(" Total count = " + "\t " + "|"
                    + totalCount + "|");
            System.out.printf(" Term Frequency = | %8.4f |", termFrequency);
            System.out.println("\t ");

            // The float cast is kept on purpose so the printed IDF values
            // stay identical to the original program's output.
            // NOTE(review): presumably numDoc is the total number of documents
            // and numofDoc[i] the number containing word i -- confirm.
            double inverseTF = Math.log10((float) numDoc
                    / (numofDoc[i]));
            System.out.println(" --> IDF = " + inverseTF );

            double TFIDF = termFrequency * inverseTF;
            System.out.println(" --> TF/IDF = " + TFIDF + "\n");
        }
    } catch (FileNotFoundException e) {
        // Raised by new Scanner(file) when abc<a>.txt does not exist;
        // report it and continue with the next file.
        System.out.println("File is not found");
    }
}
}
}
这是示例输出:
The word inputted : how are you
| File = abc0.txt |
how --> Word count = |4| Total count = |957| Term Frequency = | 0.0042 |
--> IDF = 0.5642714398516419
--> TF/IDF = 0.0023585013159943234
are --> Word count = |7| Total count = |957| Term Frequency = | 0.0073 |
--> IDF = 0.1962946357308887
--> TF/IDF = 0.00143580193324579
you --> Word count = |10| Total count = |957| Term Frequency = | 0.0104 |
--> IDF = 0.1962946357308887
--> TF/IDF = 0.002051145618922557
如何把每个文本文件中这 3 个 TF/IDF 值加起来,得到一个总和?
答案 0 :(得分:1)
假设您只想显示一个总计,那么请在 for 循环之前添加以下内容:
// Running sum of the TF/IDF values; starts at zero.
double runningTfIDF = 0;
然后在计算出当前 TF/IDF 之后,立即添加下面这一行:
// Add the TF/IDF value just computed to the running sum.
runningTfIDF += TFIDF;
最后,在 for 循环结束之后,添加一行代码打印 runningTfIDF 即可。
已修改为包含更完整的答案
// Per-word results for the current file, keyed by query word.
// LinkedHashMap (fully qualified, so no extra import is needed) keeps the
// words in the order they appear in array2 when they are printed later.
// NOTE(review): the names wordCount and inverseTF collide with any variables
// of the same name already declared in the enclosing scope -- rename there
// if this snippet is pasted into existing code.
HashMap<String, BigDecimal> runningTfIDF = new java.util.LinkedHashMap<String, BigDecimal>();
HashMap<String, BigDecimal> wordCount = new java.util.LinkedHashMap<String, BigDecimal>();
HashMap<String, BigDecimal> frequency = new java.util.LinkedHashMap<String, BigDecimal>();
HashMap<String, BigDecimal> inverseTF = new java.util.LinkedHashMap<String, BigDecimal>();

// Pass 1: count each query word in the file and store its statistics.
for (int i = 0; i < array2.length; i++) {
    totalCount = 0;
    int wordCountVal = 0;

    // try-with-resources closes the Scanner (and the file handle) each pass.
    try (Scanner s = new Scanner(file)) {
        while (s.hasNext()) {
            totalCount++;
            if (s.next().equals(array2[i])) {
                wordCountVal++;
            }
        }
    }

    wordCount.put(array2[i], new BigDecimal(wordCountVal));

    // Term frequency; an empty file is treated as frequency 0 because a
    // BigDecimal cannot represent NaN.
    double tf = (totalCount == 0) ? 0.0 : (double) wordCountVal / totalCount;
    BigDecimal frequencyVal = BigDecimal.valueOf(tf);
    frequency.put(array2[i], frequencyVal);

    // Inverse document frequency. A non-finite result (e.g. division by a
    // zero document count) is stored as 0, since BigDecimal.valueOf would
    // throw NumberFormatException on NaN or Infinity.
    double idf = Math.log10((float) numDoc
            / (numofDoc[i]));
    if (Double.isNaN(idf) || Double.isInfinite(idf)) {
        idf = 0.0;
    }
    BigDecimal inverseTFVal = BigDecimal.valueOf(idf);
    inverseTF.put(array2[i], inverseTFVal);

    BigDecimal TFIDF = frequencyVal.multiply(inverseTFVal);
    runningTfIDF.put(array2[i], TFIDF);
}

// Pass 2: display the stored results once, after all words were processed.
// totalCount still holds the token count of the file from the last pass;
// it is the same for every word because the same file is scanned each time.
for (String word : wordCount.keySet()) {
    System.out.print(word + " --> word count "
            + "\t |" + wordCount.get(word) + "|");
    System.out.print(" Total count = " + "\t " + "|"
            + totalCount + "|");
    System.out.printf(" Term Frequency = | %8.4f |",
            frequency.get(word));
    System.out.println("\t ");
    System.out.println(" --> IDF = " + inverseTF.get(word));
    System.out.println(" --> TF/IDF = " + runningTfIDF.get(word) + "\n");
}
这并不是最简洁的实现方式,但简而言之:您需要先把每个单词的信息保存下来;如果希望从第一个单词开始依次显示结果,就在全部计算完成之后再遍历这些单词并输出。这样说清楚了吗?