我找不到Chrome的单词频率扩展名列出了某个单词在页面上使用的次数(我需要按照使用频率排列至少100个结果列表),所以我使用了复制javascript bookmarklet并稍微调整一下以过滤常用词。
但是,原始代码和修改后的代码都会输出一个列表,该列表省略了某些单词的第一个字母,例如" roperty"而不是"属性"," ubversion"而不是"颠覆"等等。是什么导致了这个?
以下是原始代码的链接:https://gist.github.com/RonnyO/3004194
以下是我稍微调整后的代码:
javascript: (function () {
var settings = {
listLength: 100,
ignore: ['the', 'be', 'to', 'of', 'and', 'in', 'that', 'have', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'fake', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us']
},
w, s;
function getBodyText() {
var doc = document,
body = doc.body,
selection, range, bodyText;
if (body.createTextRange) {
return body.createTextRange().text;
} else if (getSelection) {
selection = getSelection();
range = doc.createRange();
range.selectNodeContents(body);
selection.addRange(range);
bodyText = selection.toString();
selection.removeAllRanges();
return bodyText;
}
}
var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\\,:;-`~!@#$%&_]+/g;
var words = getBodyText().trim().replace(punctuation, ' ').replace(/\s+/g, ' ').split(' '),
count = {},
sorted = [];
for (w in words) {if (words.hasOwnProperty(w) && settings.ignore.indexOf(words[w]) == -1) {
var word = words[w];
count[word] = count[word] ? count[word] + 1 : 1;
}
}
for (w in count) if (count.hasOwnProperty(w)) {
sorted.push([w, count[w]]);
}
s = sorted.sort(function (a, b) {
return b[1] - a[1];
});
var output = '<title>word frequency</title><ul style="direction: ltr; text-align: left; font-family: sans-serif; line-height: 130%;">';
for (s in sorted.slice(0, settings.listLength)) {
var c = sorted[s];
output += '<li>' + c[1] + ': ' + c[0] + '</li>';
}
output += '</ul>';
with(open().document){
write(output);
close();
}
})();
抱歉可怕的缩进..
答案 0 :(得分:0)
更改标点符号以逃避连字符。
var punctuation = /[\/\.\*\+\+\?\|\(\)\[\]\{\}\^\\,:;\-`~!@#$%&_]+/g;