如何使用 Java 增加所显示的高频单词的数量

时间:2013-07-05 07:18:39

标签: java map count word-frequency

下面是我的程序,它会显示出现频率最高的前 10 个单词,但我的需求是得到出现频率最高的前 30 个单词。

/**
 * Approximate word-frequency counter.
 *
 * <p>Every word is hashed with four different functions; each hash selects one
 * counter in its own row of {@link #table}. Adding a word increments all four
 * counters, and the estimated frequency of a word is the minimum of its four
 * counters (a count-min-sketch style estimate, so collisions can only
 * over-count, never under-count).
 *
 * <p>{@link #map} groups the tracked words by their current estimate, highest
 * estimate first. It holds at most {@code maxBuckets} distinct estimates; note
 * that this limits the number of distinct <em>counts</em>, not the number of
 * words, because several words can share one count.
 */
class FrequencyCount {

    /** Number of counters per row; every hash value is reduced into [0, TABLE_WIDTH). */
    private static final int TABLE_WIDTH = 1000000;

    /** Bucket limit used by the no-argument constructor (the value that used to be hard-coded). */
    private static final int DEFAULT_MAX_BUCKETS = 10;

    /**
     * Lower-case words that {@link #main} ignores. Empty by default, so no
     * token is filtered unless entries are added here.
     */
    private static final String[] STOP_WORDS = {};

    /** Four rows of counters, one row per hash function. */
    int[][] table = new int[4][TABLE_WIDTH];

    /** Estimated count -> words currently at that count, ordered from highest count to lowest. */
    TreeMap<Integer, List<String>> map = new TreeMap<Integer, List<String>>(
            Collections.reverseOrder());

    /** Maximum number of distinct counts kept in {@link #map}. */
    private final int maxBuckets;

    /** Creates a counter that tracks at most 10 distinct counts. */
    FrequencyCount() {
        this(DEFAULT_MAX_BUCKETS);
    }

    /**
     * Creates a counter that tracks at most {@code maxBuckets} distinct counts.
     *
     * @param maxBuckets maximum number of keys kept in {@link #map}; must be at least 1
     * @throws IllegalArgumentException if {@code maxBuckets} is less than 1
     */
    FrequencyCount(int maxBuckets) {
        if (maxBuckets < 1) {
            throw new IllegalArgumentException("maxBuckets must be >= 1 but was " + maxBuckets);
        }
        this.maxBuckets = maxBuckets;
    }

    /**
     * Reads a text file line by line, counts its space-separated words and
     * prints every tracked bucket as {@code [words] found N times}.
     *
     * @param args optional: {@code args[0]} is the file to read (default
     *             {@code C:/portable.pdf}); {@code args[1]} is the bucket limit
     *             (default 10)
     * @throws Exception if the arguments are invalid or closing the reader fails
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): the default input is a .pdf, which is normally a binary
        // format; reading it as text lines probably yields noise - confirm.
        String path = args.length > 0 ? args[0] : "C:/portable.pdf";
        int limit = args.length > 1 ? Integer.parseInt(args[1]) : DEFAULT_MAX_BUCKETS;

        FrequencyCount freq = new FrequencyCount(limit);
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(new File(path)));
            String fileline = br.readLine();
            System.out.println("fileline:" + fileline);
            while (fileline != null) {
                if (fileline.length() > 0) {
                    for (String s : fileline.split(" ")) {
                        if (s.trim().length() == 0 || isStopWord(s)) {
                            continue;
                        }
                        // trimStr yields "" for tokens without any ASCII letter
                        // (e.g. "'s"); such tokens are skipped, not counted.
                        String word = freq.trimStr(s);
                        if (word.length() > 0) {
                            freq.add(word);
                        }
                    }
                }
                fileline = br.readLine();
            }

            Set<Integer> set = freq.map.keySet();
            for (Integer x : set) {
                System.out.println(freq.map.get(x) + " found " + x + " times");
            }

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // br is still null when opening the file failed.
            if (br != null) {
                br.close();
            }
        }
    }

    /** Returns true when the lower-cased token equals one of the configured stop words. */
    private static boolean isStopWord(String token) {
        String lower = token.toLowerCase();
        for (int j = 0; j < STOP_WORDS.length; j++) {
            if (STOP_WORDS[j].equals(lower)) {
                return true;
            }
        }
        return false;
    }

    /** Returns true for the ASCII letters A-Z and a-z. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    /**
     * Normalises a token: lower-cases and trims it, drops a trailing
     * {@code 's}, then strips every leading and trailing character that is not
     * an ASCII letter.
     *
     * @param s the raw token
     * @return {@code s} unchanged when it contains no cased characters (for
     *         example {@code "123"}); otherwise the normalised word, which is
     *         the empty string when no ASCII letter is left
     */
    public String trimStr(String s) {
        if (s.toUpperCase().equals(s.toLowerCase())) {
            return s;
        }
        s = s.toLowerCase().trim();

        if (s.endsWith("'s")) {
            s = s.substring(0, s.length() - 2);
        }

        int i = 0;
        int j = s.length() - 1;
        // Both scans are bounded so a token without ASCII letters ends with
        // i == j + 1 and yields "" instead of running off the string.
        while (i <= j && !isAsciiLetter(s.charAt(i))) {
            i++;
        }
        while (j >= i && !isAsciiLetter(s.charAt(j))) {
            j--;
        }
        return s.substring(i, j + 1);
    }

    /**
     * Computes the four table indices of a word.
     *
     * @param s the word
     * @return {@code {hash1(s), hash2(s), hash3(s), hash4(s)}}
     */
    public int[] hash(String s) {
        return new int[] { hash1(s), hash2(s), hash3(s), hash4(s) };
    }

    /** Additive hash: the sum of all character codes, reduced to a table index. */
    public int hash1(String x) {
        int sum = 0;
        for (int i = 0; i < x.length(); i++) {
            sum += x.charAt(i);
        }
        // The sum can overflow for very long tokens; toIndex keeps it non-negative.
        return toIndex(sum);
    }

    /** Polynomial hash with multiplier 31, reduced to a table index. */
    public int hash2(String s) {
        return polynomialHash(s, 31);
    }

    /** Polynomial hash with multiplier 17, reduced to a table index. */
    public int hash3(String s) {
        return polynomialHash(s, 17);
    }

    /** Polynomial hash with multiplier 11, reduced to a table index. */
    public int hash4(String s) {
        return polynomialHash(s, 11);
    }

    /** Computes h = multiplier * h + c over all characters and reduces it to a table index. */
    private static int polynomialHash(String s, int multiplier) {
        int h = 0;
        for (int i = 0; i < s.length(); i++) {
            h = multiplier * h + s.charAt(i);
        }
        return toIndex(h);
    }

    /** Maps any int (including overflowed negatives) into [0, TABLE_WIDTH). */
    private static int toIndex(int h) {
        int index = h % TABLE_WIDTH;
        return index < 0 ? -index : index;
    }

    /**
     * Records one occurrence of {@code s} and updates the buckets in {@link #map}.
     *
     * <p>When the new estimate needs a new bucket and the map is already full,
     * the bucket with the lowest count is evicted, unless the new estimate is
     * even lower, in which case the word is counted in {@link #table} but not
     * tracked in the map.
     *
     * @param s the (already normalised) word
     */
    public void add(String s) {
        int[] h = hash(s);

        // Increment one counter per row; the estimate is the smallest of them.
        int r = Integer.MAX_VALUE;
        for (int row = 0; row < table.length; row++) {
            table[row][h[row]]++;
            r = Math.min(r, table[row][h[row]]);
        }

        List<String> list = map.get(r);
        if (list == null) {
            if (map.size() >= maxBuckets) {
                // With the reversed ordering, lastKey() is the lowest tracked count.
                Integer lastKey = map.lastKey();
                if (lastKey.intValue() > r) {
                    return;
                }
                map.remove(lastKey);
            }
            list = new ArrayList<String>();
            map.put(r, list);
        }
        list.add(s);

        // The word moved up from estimate r - 1: take it out of that bucket and
        // drop the bucket only when nothing is left in it. The remaining entry
        // may be a different word, so a bucket of size one must not be removed
        // unconditionally.
        if (r > 1) {
            List<String> previous = map.get(r - 1);
            if (previous != null) {
                previous.remove(s);
                if (previous.isEmpty()) {
                    map.remove(r - 1);
                }
            }
        }
    }

    /**
     * Returns the estimated number of times {@code s} has been added.
     *
     * @param s the word
     * @return the minimum of the word's four counters
     */
    public int count(String s) {
        int[] h = hash(s);
        int estimate = table[0][h[0]];
        for (int row = 1; row < table.length; row++) {
            estimate = Math.min(estimate, table[row][h[row]]);
        }
        return estimate;
    }
}

我已经把 map 的大小改为 30,但没有效果。请问我该如何获得出现频率最高的前 30 个单词?

谢谢

2 个答案:

答案 0 :(得分:0)

我会换一种方法:把单词存入 HashMap,以小写形式的单词字符串作为键,以该单词出现的次数作为值。整个 map 构建完成之后,可以按照此处所述(described here)的方法按值排序,然后根据需要显示排在最前面的任意数量的单词。

总体思路:

// Running tally: word -> number of times it has been seen so far.
HashMap<String, Integer> wordcounter = new HashMap<String, Integer>();

    // For each word s: the first sighting is stored as 1, every later one
    // stores the previous total plus one.
    int occurrences = 1;
    if (wordcounter.containsKey(s)) {
        occurrences = wordcounter.get(s) + 1;
    }
    wordcounter.put(s, occurrences);

答案 1 :(得分:0)

如下所示,只需要把代码中的一处 10 改为 30。

/**
 * Records one occurrence of {@code s}: increments the word's counter in each
 * of the four rows of {@code table}, takes the smallest of those counters as
 * the word's estimated frequency and files the word under that estimate in
 * {@code map}.
 *
 * <p>{@code map} keeps at most 30 distinct estimates. When a new estimate
 * needs a bucket and the map is full, the bucket returned by
 * {@code map.lastKey()} is evicted, unless that key is greater than the new
 * estimate, in which case the word is not tracked in the map.
 *
 * @param s the word to count
 */
public void add(String s) {
    final int maxBuckets = 30; // Changed from 10 to 30

    int[] h = hash(s);
    int r = Integer.MAX_VALUE;
    for (int row = 0; row < 4; row++) {
        table[row][h[row]] = table[row][h[row]] + 1;
        r = Math.min(r, table[row][h[row]]);
    }

    List<String> list = map.get(r);
    if (list == null) {
        if (map.size() == maxBuckets) {
            Integer lastKey = map.lastKey();
            if (lastKey.intValue() > r) {
                // The new estimate ranks below everything tracked: ignore it.
                return;
            }
            map.remove(lastKey);
        }
        list = new ArrayList<String>();
        map.put(r, list);
    }
    list.add(s);

    // The word has moved up from estimate r - 1, so take it out of that
    // bucket. The bucket is dropped only once it is empty: its single
    // remaining entry may be a different word, and removing the bucket just
    // because it has size one would silently lose that word.
    if (r > 1) {
        List<String> previous = map.get(r - 1);
        if (previous != null) {
            previous.remove(s);
            if (previous.isEmpty()) {
                map.remove(r - 1);
            }
        }
    }
}