Question

我想读取一个 .txt 文件，取出文本中的所有单词，然后打印出每个单词出现的次数。

例如，请使用以下文字：

"A bright day a man walked into a bar and asked for a beer. He was denied the beer because he was a sheep"

输出应该是这样：

5: a
2: he, beer, was
1: bright, day, walked, into, bar, asked, denied, the, because, man, and, for, sheep

这是我的代码：

/**
 * Reads the file named by {@code filen} line by line, counts how often each word occurs
 * (case-insensitively, after stripping every non-letter character), then groups the
 * counts into a count-to-words map and prints one line per distinct count.
 *
 * <p>As written, the word sets in the second map are created but never filled, so every
 * printed count is followed by an empty set.
 *
 * @throws IOException if the file cannot be opened or read
 */
private void computeFrequencyMap() throws IOException {
    // Word -> number of occurrences, kept sorted alphabetically by the TreeMap.
    TreeMap<String, Integer> dMap = new TreeMap<String, Integer>();
    // NOTE(review): filen is declared outside this method - presumably the input file path.
    // The reader is not closed anywhere in this method.
    BufferedReader br = new BufferedReader(new FileReader(filen));
    String line;
    while( (line = br.readLine()) != null){
         // Break the line into whitespace-separated tokens.
         String [] words = line.split("\\s+");
      for (String word : words) {
          // Drop punctuation and digits; a token without letters becomes the empty string.
          word = word.replaceAll("[^a-zA-Z]", "");
        if (!dMap.containsKey(word.toLowerCase())) {
          dMap.put(word.toLowerCase(), 1);
        } else {
          int count = dMap.get(word.toLowerCase());
          dMap.put(word.toLowerCase(), count + 1);
        }
      }
    }
    // Count -> set of words with that count, sorted by ascending count.
    TreeMap<Integer, HashSet<String>> sMap = new TreeMap<Integer, HashSet<String>>();
    for (Map.Entry<String, Integer> entry : dMap.entrySet()) {
        if(sMap.containsKey(entry.getValue())){
            // Nothing happens for a count that is already present: the attempt below is
            // disabled, so the word is not stored.
            //sMap.put(entry.getValue(), entry.getKey());
        }else{
            // First word with this count: an empty set is registered, but the word itself
            // is not added to it.
            sMap.put(entry.getValue(), new HashSet<String>());
        }
   }
   // Print "<count> <words>" for every distinct count.
   for (Entry<Integer, HashSet<String>> entry : sMap.entrySet()) {
        System.out.println(entry.getKey() + " " + entry.getValue());
   }
}

这是我运行代码时得到的结果：

1 []
2 []
3 []
4 []
5 []
6 []
7 []
8 []
9 []
10 []
11 []
12 []
14 []
16 []
18 []
27 []
32 []
33 []
38 []
44 []
54 []
71 []

我目前只做到了这一步，无法完成“把单词添加到 Set 中”这一部分，有什么建议吗？

Answer 1

您在反转映射（Map）时忘记了实际添加单词：

// Invert word -> count into count -> words; the TreeMap keeps counts in ascending order.
Map<Integer, Set<String>> sMap = new TreeMap<>();
for (Map.Entry<String, Integer> wordCount : dMap.entrySet()) {
    // Create the set for this count the first time it is seen, then file the word under it.
    sMap.computeIfAbsent(wordCount.getValue(), count -> new HashSet<>())
        .add(wordCount.getKey());
}

Answer 2

这应该可以解决您的问题

/**
 * Reads {@code input.txt}, counts how often each word occurs (case-insensitively, after
 * stripping every non-letter character), prints the word-to-count map, and then prints
 * one line per distinct count followed by the words that occur that many times.
 *
 * @throws IOException if the file cannot be opened or read
 */
private void computeFrequencyMap() throws IOException {
    // Word -> number of occurrences, sorted alphabetically.
    TreeMap<String, Integer> dMap = new TreeMap<String, Integer>();
    // try-with-resources closes the reader even when readLine() throws.
    try (BufferedReader br = new BufferedReader(new FileReader("input.txt"))) {
        String line;
        while ((line = br.readLine()) != null) {
            for (String token : line.split("\\s+")) {
                String word = token.replaceAll("[^a-zA-Z]", "").toLowerCase();
                // Blank lines and tokens made only of punctuation or digits leave an empty
                // string behind; it is not a word and must not be counted.
                if (word.isEmpty()) {
                    continue;
                }
                Integer count = dMap.get(word);
                dMap.put(word, count == null ? 1 : count + 1);
            }
        }
    }
    System.out.println(dMap);

    // Count -> words with that count, sorted by ascending count.
    TreeMap<Integer, Set<String>> sMap = new TreeMap<Integer, Set<String>>();
    for (Map.Entry<String, Integer> entry : dMap.entrySet()) {
        Set<String> words = sMap.get(entry.getValue());
        if (words == null) {
            // LinkedHashSet keeps the alphabetical order in which dMap hands over the words.
            words = new LinkedHashSet<String>();
            sMap.put(entry.getValue(), words);
        }
        words.add(entry.getKey());
    }
    for (Map.Entry<Integer, Set<String>> entry : sMap.entrySet()) {
        System.out.println(entry.getKey() + " " + entry.getValue());
    }
}

Answer 3

这不是Java答案。但您的问题是如何使用经典Unix shell工具（tr，sed，sort，uniq）的完美示例：

tr 将所有大写字母转换为小写字母
sed 删除句点（.）
tr 将空格替换为换行符，使每个单词独占一行
sort 按字母顺序排序
uniq -c 统计每个单词出现的次数
sort -r -n 按计数从高到低排序

试试这个：

# Word frequencies of so.txt: lower-case the text, delete every dot, put one word
# per line (squeezing repeated separators so no empty "word" is counted), then
# count identical words and list the most frequent first.
$ tr '[:upper:]' '[:lower:]' < so.txt |\
  sed 's/\.//g' |\
  tr -s " " "\n" |\
  sort |\
  uniq -c |\
  sort -r -n

输出：

      5 a
      2 was
      2 he
      2 beer
      1 walked
      1 the
      1 sheep
      1 man
      1 into
      1 for
      1 denied
      1 day
      1 bright
      1 because
      1 bar
      1 asked
      1 and

Answer 4

只是为了好玩，以下是如何在Java 8中完成的：

// Sample sentence whose words are counted.
String content = "A bright day a man walked into a bar and asked for a beer. He was denied the beer because he was a sheep";

// Step 1: split on runs of spaces/dots, lower-case each token and count occurrences per word.
Map<String, Long> countByWord = Arrays.stream(content.split("( |\\.)+"))
    .map(token -> token.toLowerCase())
    .collect(groupingBy(Function.identity(), counting()));

// Step 2: regroup the words under their count; the reverse-ordered TreeMap puts the
// highest count first.
Map<Long, List<String>> mapByCount = countByWord.entrySet().stream()
    .collect(groupingBy(
        Map.Entry::getValue,
        () -> new TreeMap<>(Collections.reverseOrder()),
        mapping(Map.Entry::getKey, toList())));

System.out.println(mapByCount);

输出：

{
    5=[a],
    2=[was, he, beer], 
    1=[for, bright, the, into, bar, walked, asked, and, because, man, denied, day, sheep] 
}

Answer 5

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

/**
 * Counts how often each word occurs in {@code input.txt} and prints the words grouped by
 * their number of occurrences, highest count first.
 */
public class Test {
    /** Word delimiter: any run of whitespace (spaces, tabs, line breaks) and dots. */
    private static final String REGEX = "[\\s.]+";

    /**
     * Counts the occurrences of every word in the given text, ignoring case.
     *
     * @param txt the text to analyse; words are separated by whitespace and/or dots
     * @return a map from lower-cased word to its number of occurrences; empty when the
     *         text contains no words
     */
    public static Map<String, Integer> getWordCount(String txt) {
        Map<String, Integer> map = new HashMap<>();

        for (String str : txt.split(REGEX)) {
            // split() yields an empty token for empty text and for text that starts with
            // a delimiter; that token is not a word.
            if (str.isEmpty()) {
                continue;
            }
            // Locale.ROOT keeps lower-casing independent of the machine's default locale.
            map.merge(str.toLowerCase(Locale.ROOT), 1, Integer::sum);
        }
        return map;
    }

    /**
     * Groups words by their number of occurrences.
     *
     * @param wordCount a map from word to its number of occurrences
     * @return a map from count to the comma-separated words having that count, iterated
     *         from the highest count to the lowest; words sharing a count appear in the
     *         iteration order of {@code wordCount}
     */
    public static Map<Integer, String> reverseMap(Map<String, Integer> wordCount) {
        // Reverse-ordered TreeMap so the most frequent words are listed first.
        Map<Integer, String> map = new TreeMap<>(Collections.reverseOrder());

        for (Map.Entry<String, Integer> entry : wordCount.entrySet()) {
            // First word for a count is stored as-is; later ones are appended after a comma.
            map.merge(entry.getValue(), entry.getKey(), (joined, word) -> joined + "," + word);
        }

        return map;
    }

    /**
     * Reads {@code input.txt} from the working directory and prints its words grouped by count.
     *
     * @param args unused
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        String content = new String(Files.readAllBytes(Paths.get("input.txt")),
                StandardCharsets.UTF_8);

        Map<String, Integer> wordCount = getWordCount(content);
        Map<Integer, String> result = reverseMap(wordCount);
        System.out.println(result);
    }
}

在 Map<> 里面嵌套 Set<>，如何打印全部内容？

5 个答案: