Javascript单词频率书签省略了输出

时间:2015-11-06 10:17:26

标签: javascript frequency bookmarklet word

我找不到Chrome的单词频率扩展名列出了某个单词在页面上使用的次数(我需要按照使用频率排列至少100个结果列表),所以我使用了复制javascript bookmarklet并稍微调整一下以过滤常用词。

但是,原始代码和修改后的代码都会输出一个列表,该列表省略了某些单词的第一个字母,例如" roperty"而不是"属性"," ubversion"而不是"颠覆"等等。是什么导致了这个?

以下是原始代码的链接:https://gist.github.com/RonnyO/3004194

以下是我稍微调整后的代码:

javascript: (function () {
            var settings = {
                           listLength: 100,
                           ignore: ['the', 'be', 'to', 'of', 'and', 'in', 'that', 'have', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'fake', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us']
                    },
                    w, s;
            function getBodyText() {
                    var doc = document,
                            body = doc.body,
                            selection, range, bodyText;
            if (body.createTextRange) {
                            return body.createTextRange().text;
            } else if (getSelection) {
                            selection = getSelection();
                            range = doc.createRange();
                            range.selectNodeContents(body);
                            selection.addRange(range);
                            bodyText = selection.toString();
                            selection.removeAllRanges();
                            return bodyText;
            }
     }

     var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\\,:;-`~!@#$%&_]+/g;
     var words = getBodyText().trim().replace(punctuation, ' ').replace(/\s+/g, ' ').split(' '),
               count = {},
               sorted = [];

    for (w in words) {if (words.hasOwnProperty(w) && settings.ignore.indexOf(words[w]) == -1) {
        var word = words[w];
        count[word] = count[word] ? count[word] + 1 : 1;
    }
}

for (w in count) if (count.hasOwnProperty(w)) {
    sorted.push([w, count[w]]);
}

s = sorted.sort(function (a, b) {
    return b[1] - a[1];
});

var output = '<title>word frequency</title><ul style="direction: ltr; text-align: left; font-family: sans-serif; line-height: 130%;">';
for (s in sorted.slice(0, settings.listLength)) {
    var c = sorted[s];
    output += '<li>' + c[1] + ': ' + c[0] + '</li>';
}
output += '</ul>';

with(open().document){
    write(output);
    close();
}
})();

抱歉可怕的缩进..

1 个答案:

答案 0 :(得分:0)

更改标点符号以逃避连字符。

var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\\,:;\-`~!@#$%&_]+/g;