import java.io.*;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
/**
 * Counts how often each word occurs in a text file and prints every word together with
 * its absolute frequency and its percentage of all counted words.
 *
 * <p>Words are whitespace-separated tokens, compared case-insensitively after punctuation
 * has been stripped. Results are held in a {@link TreeMap}, so they are reported in
 * ascending key order.
 */
public class ReadFile {

    /** File that is read when no other file name is supplied. */
    private static final String DEFAULT_FILE = "file1.txt";

    /** Token separator: one or more whitespace characters. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Characters that are removed wherever they occur inside a token. */
    private static final Pattern INNER_PUNCTUATION = Pattern.compile("[,.;!]");

    /** A run of ASCII punctuation at the very start or the very end of a token. */
    private static final Pattern EDGE_PUNCTUATION =
            Pattern.compile("^\\p{Punct}+|\\p{Punct}+$");

    /**
     * Builds the word-frequency table for the default input file {@code file1.txt}.
     *
     * @return lower-cased words mapped to their number of occurrences, sorted by word
     * @throws IOException if the file cannot be opened or read
     */
    public static TreeMap<String, Integer> generateFrequencyList() throws IOException {
        return generateFrequencyList(DEFAULT_FILE);
    }

    /**
     * Builds the word-frequency table for the given file.
     *
     * @param fileName path of the text file to analyse
     * @return lower-cased words mapped to their number of occurrences, sorted by word
     * @throws IOException if the file cannot be opened or read
     */
    public static TreeMap<String, Integer> generateFrequencyList(String fileName)
            throws IOException {
        // try-with-resources closes the reader even when reading fails part-way.
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            return countWords(reader);
        }
    }

    /**
     * Counts the words delivered by {@code source}, line by line.
     *
     * <p>Tokens that are empty once punctuation has been removed (blank lines, leading
     * whitespace, tokens consisting only of punctuation) are not counted. The reader is
     * not closed here; that is the caller's responsibility.
     *
     * @param source text to analyse
     * @return lower-cased words mapped to their number of occurrences, sorted by word
     * @throws IOException if reading from {@code source} fails
     */
    static TreeMap<String, Integer> countWords(Reader source) throws IOException {
        TreeMap<String, Integer> frequencies = new TreeMap<>();
        BufferedReader reader = source instanceof BufferedReader
                ? (BufferedReader) source
                : new BufferedReader(source);

        String line;
        while ((line = reader.readLine()) != null) {
            for (String token : WHITESPACE.split(line)) {
                // Locale.ROOT keeps the keys independent of the machine's default locale.
                String word = removePunctuation(token).toLowerCase(Locale.ROOT);
                if (word.isEmpty()) {
                    continue; // nothing left to count
                }
                Integer seen = frequencies.get(word);
                frequencies.put(word, seen == null ? 1 : seen + 1);
            }
        }
        return frequencies;
    }

    /**
     * Strips punctuation from a single token.
     *
     * <p>Commas, periods, semicolons and exclamation marks are removed anywhere in the
     * token. Any other ASCII punctuation is removed only from the start and the end, so
     * {@code list?} becomes {@code list} while {@code lenovo's} is left intact.
     *
     * @param token raw whitespace-delimited token
     * @return the token without punctuation; may be empty
     */
    private static String removePunctuation(String token) {
        String withoutInner = INNER_PUNCTUATION.matcher(token).replaceAll("");
        return EDGE_PUNCTUATION.matcher(withoutInner).replaceAll("");
    }

    /**
     * Prints one line per word in the form {@code word = count => percentage}.
     *
     * @param args optional; {@code args[0]} names the file to analyse, otherwise
     *             {@code file1.txt} is used
     */
    public static void main(String[] args) {
        String fileName = args.length > 0 ? args[0] : DEFAULT_FILE;
        try {
            Map<String, Integer> freqMap = generateFrequencyList(fileName);

            // The grand total is the same for every entry, so compute it once up front.
            long total = 0;
            for (int wordCount : freqMap.values()) {
                total += wordCount;
            }

            for (Map.Entry<String, Integer> entry : freqMap.entrySet()) {
                int value = entry.getValue();
                // An empty map never reaches this line, so total is always positive here.
                float percentage = (value / (float) total) * 100;
                System.out.println(entry.getKey() + " = " + value + " => " + percentage);
            }
        } catch (IOException e) {
            System.err.println("Could not read " + fileName + ": " + e);
        }
    }
}
我需要这个程序读取一个 .txt 文件,并返回单词列表、每个单词的出现频率和百分比。我使用的 .txt 文件中的单词比打印出来的多,程序似乎只打印了以 T 到 Z 开头的单词。我不知道该如何修改才能让它输出所有单词。有谁知道为什么它没有给出完整的单词列表吗?
答案 0 :(得分:0)
在我这里运行正常。也许提供你的测试文件和示例输出会有帮助。另外给一个小提示:你可以把“total”的计算移到 for 循环之外,这样就不必在每次迭代时重新求和。 :)
i / p我需要这个程序来读取.txt文件并返回一个单词列表, 频率和百分比。我使用的.txt文件比单词更多 打印出来。它似乎只打印出T中的单词 - Z.我不知道如何修复它给我所有的话。任何人 有一些想法,为什么它没有给我完整的单词列表?
o / p“ - = 1
=> 1.388889 a = 1 => 1.388889 all = 1 => 1.388889 am = 2 => 2.777778 和 = 2 => 2.777778 everyone = 1 => 1.388889 be = 1 => 1.388889 = 1 => 1.388889 file = 2 => 2.777778 fix = 1 => 1.388889 frequency = 1 => 1.388889 from = 1 => 1.388889 full = 1 => 1.388889 give = 2 => 2.777778 has = 1 => 1.388889 have = 1 => 1.388889 how = 1 => 1.388889 i = 3 => 4.166667 ideas = 1 => 1.388889 it = 3 => 4.166667 list = 1 => 1.388889 列表? = 1 => 1.388889 me = 2 => 2.777778 more = 1 => 1.388889 need = 1 => 1.388889 not = 2 => 2.777778 of = 1 => 1.388889 仅 = 1 => 1.388889 out = 2 => 2.777778 百分比 = 1 => 1.388889 printing = 1 => 1.388889 prints = 1 => 1.388889 program = 1 => 1.388889 读 = 1 => 1.388889 return = 1 => 1.388889 似乎 = 1 => 1.388889 some = 1 => 1.388889 sure = 1 => 1.388889 t = 1 => 1.388889 比 = 1 => 1.388889 = 5 => 6.9444447 这 = 2 => 2.777778 至 = 4 => 5.555556 txt = 2 => 2.777778 使用 = 1 => 1.388889 为什么 = 1 => 1.388889 word = 2 => 2.777778 words = 3 => 4.166667 z = 1 => 1.388889"
答案 1 :(得分:0)
我运行了你的代码,看起来也没有问题。这是我的输出:
a = 5 => 3.164557
accept = 1 => 0.6329114
acceptance = 1 => 0.6329114
accepts = 1 => 0.6329114
and = 1 => 0.6329114
any = 3 => 1.8987341
applicable = 1 => 0.6329114
are = 2 => 1.2658228
at = 2 => 1.2658228
available = 1 => 0.6329114
be = 4 => 2.5316455
been = 1 => 0.6329114
being = 1 => 0.6329114
billed = 1 => 0.6329114
buy = 1 => 0.6329114
by = 2 => 1.2658228
can = 1 => 0.6329114
card = 1 => 0.6329114
charge = 1 => 0.6329114
charges = 1 => 0.6329114
confirmation = 1 => 0.6329114
constitute = 1 => 0.6329114
credit = 1 => 0.6329114
customer = 2 => 1.2658228
deposit = 1 => 0.6329114
direct = 1 => 0.6329114
do = 1 => 0.6329114
does = 1 => 0.6329114
email = 3 => 1.8987341
for = 3 => 1.8987341
from = 2 => 1.2658228
hardware = 1 => 0.6329114
has = 1 => 0.6329114
if = 2 => 1.2658228
is = 1 => 0.6329114
lenovo = 3 => 1.8987341
lenovo's = 1 => 0.6329114
limit = 1 => 0.6329114
locations = 1 => 0.6329114
made = 1 => 0.6329114
making = 1 => 0.6329114
may = 1 => 0.6329114
multiple = 1 => 0.6329114
no = 1 => 0.6329114
not = 3 => 1.8987341
notify = 1 => 0.6329114
number = 1 => 0.6329114
of = 2 => 1.2658228
once = 1 => 0.6329114
one = 1 => 0.6329114
only = 1 => 0.6329114
or = 5 => 3.164557
order = 5 => 3.164557
orders = 1 => 0.6329114
particular = 1 => 0.6329114
payment = 1 => 0.6329114
phone = 1 => 0.6329114
processed = 2 => 1.2658228
product = 4 => 2.5316455
providing = 1 => 0.6329114
purchase = 1 => 0.6329114
reason = 1 => 0.6329114
receive = 1 => 0.6329114
refunded = 1 => 0.6329114
refuse = 1 => 0.6329114
reserves = 1 => 0.6329114
right = 1 => 0.6329114
sell = 1 => 0.6329114
service = 1 => 0.6329114
shipped = 2 => 1.2658228
shipping = 4 => 2.5316455
single = 1 => 0.6329114
software = 1 => 0.6329114
some = 1 => 0.6329114
thank = 1 => 0.6329114
that = 1 => 0.6329114
the = 5 => 3.164557
this = 2 => 1.2658228
time = 2 => 1.2658228
to = 6 => 3.7974682
units = 1 => 0.6329114
updates = 1 => 0.6329114
we = 2 => 1.2658228
will = 5 => 3.164557
you = 5 => 3.164557
your = 8 => 5.063291
仅供参考:我添加了 br.close(),因为我的编辑器(IDE)提示 BufferedReader 没有被关闭。不过这不会影响输出。
}
}
br.close();
return wordsFrequencyMap;