我想从一个字符串中找出出现次数最多的3个单词,并打印包含这些单词的句子。我找到了很多关于如何获取字符串中最常用单词的资料,但不知道如何只取其中的前3个,以及如何打印包含它们的句子。目前我的代码如下:
<!DOCTYPE html>
<html>
<body>
<p id="demo"></p>
<script>
// Sample text to analyse. Double quotes are used so the apostrophe in
// "don't" does not terminate the string literal.
var text = "This is a sentence with few words in it. Repeated things. This is a sentence with words that are repeated. I don't need this text.";

/**
 * Counts case-insensitive word occurrences in a string.
 *
 * A word is a run of word characters, optionally followed by an apostrophe
 * and one or two more word characters, so "don't" is counted as one word.
 *
 * @param {string} str - Text to scan.
 * @returns {Object} Prototype-less map from lower-cased word to its count.
 */
function countWords(str) {
  var wordRegExp = /\w+(?:'\w{1,2})?/g;
  // No prototype: words such as "constructor" or "__proto__" must not
  // collide with members inherited from Object.prototype.
  var counts = Object.create(null);
  var matches;
  while ((matches = wordRegExp.exec(str)) !== null) {
    var word = matches[0].toLowerCase();
    counts[word] = (counts[word] || 0) + 1;
  }
  return counts;
}

/**
 * Converts a word-count map into [word, count] pairs, most frequent first.
 *
 * @param {Object} counts - Map from word to occurrence count.
 * @returns {Array} Array of [word, count] pairs sorted by descending count.
 */
function sortByCount(counts) {
  var pairs = Object.keys(counts).map(function (word) {
    return [word, counts[word]];
  });
  pairs.sort(function (a, b) { return b[1] - a[1]; });
  return pairs;
}

var words = countWords(text);
var wordList = sortByCount(words);
var n = 100; // maximum number of words to report
var minLength = 5; // shorter words are skipped to drop most conjunctions

// Filter BEFORE applying the limit so short words do not use up slots, and
// let slice() clamp to the number of words that actually exist (indexing up
// to n unconditionally threw once i passed the end of wordList).
var topWords = wordList
  .filter(function (entry) { return entry[0].length >= minLength; })
  .slice(0, n);

var message = ["The top words are:"];
topWords.forEach(function (entry) {
  message.push(entry[0] + " - " + entry[1] + " occurance" +
    (entry[1] === 1 ? "" : "s"));
});

// Browser-only output; skipped when no DOM / alert is available.
if (typeof document !== "undefined") {
  topWords.forEach(function (entry) {
    document.write(entry[0] + "<br />");
  });
}
if (typeof alert === "function") {
  alert(message.join("\n"));
}
</script>
</body>
</html>
我正在使用
if (wordList[i][0].length>=5)
避免连词,因为它们大多数是由3-4个字母组成的。
我应该得到这样的答案:
'This is a sentence with few words in it. Repeated things. This is a sentence with words that are repeated.'
因为出现次数最多的3个单词是:“sentence”、“repeated”和“words”。你知道如何实现这个目标吗?如能给予一些指导,将不胜感激。
答案 0 :(得分:0)
下面是一个示例,我更注重代码的清晰度而不是性能:
FIDDLE DEMO (corrected to top 3 rather than 2)
var text = "This is a sentence with few words in it. Repeated things. This is a sentence with words that are repeated. I don't need this text.";
var top3Words = topWordsIn(text, 3); // ["sentence", "words", "repeated"]
/**
 * Returns the most frequent words of a string, most frequent first.
 *
 * Only words longer than four characters are considered, which drops most
 * conjunctions and other short filler words.
 *
 * @param {string} str - Text to analyse.
 * @param {number} [top] - Maximum number of words to return; any falsy
 *   value (omitted, 0, null, ...) means "no limit".
 * @returns {string[]} Lower-cased words ordered by descending occurrence.
 */
function topWordsIn(str, top) {
  var counts = wordsOccurence(str);
  var allWords = Object.keys(counts);
  var candidates = [];
  for (var i = 0; i < allWords.length; i++) {
    if (allWords[i].length > 4) {
      candidates.push(allWords[i]);
    }
  }
  candidates.sort(function (left, right) {
    return counts[right] - counts[left];
  });
  // A falsy limit becomes undefined so slice() keeps everything.
  var end = top ? top : undefined;
  return candidates.slice(0, end);
}
/**
 * Counts how many times each word occurs in a string.
 *
 * Words are supplied by forWordsIn, which hands them over lower-cased.
 *
 * @param {string} str - Text to analyse.
 * @returns {Object} Prototype-less map from word to occurrence count, with
 *   keys in order of first appearance. Use Object.keys() or direct indexing
 *   on it; it has no inherited methods such as hasOwnProperty.
 */
function wordsOccurence(str) {
  // A prototype-less object keeps words like "constructor" or "__proto__"
  // from being mistaken for (or swallowed by) inherited Object members.
  var occurences = Object.create(null);
  forWordsIn(str, function (word) {
    occurences[word] = (occurences[word] || 0) + 1;
  });
  return occurences;
}
/**
 * Invokes a callback once for every word found in a string.
 *
 * A word is a maximal run of word characters; each one is lower-cased
 * before being passed to the callback, in order of appearance.
 *
 * @param {string} str - Text to scan.
 * @param {function(string)} cb - Receives each lower-cased word.
 */
function forWordsIn(str, cb) {
  var finder = /\b\w+\b/g;
  // exec() on a global regex resumes after the previous match and yields
  // null once the text is exhausted.
  for (var hit = finder.exec(str); hit !== null; hit = finder.exec(str)) {
    cb(hit[0].toLowerCase());
  }
}
然后,一旦得到了出现次数最多的单词,就可以简单地把字符串切分成句子(这里我只是按句点分割),并过滤掉不包含这些单词的句子。
// Matches any of the top words as a whole word, ignoring case. Built once
// rather than once per sentence; it has no "g" flag, so test() is stateless.
var topWordsRx = new RegExp('\\b(' + top3Words.join('|') + ')\\b', 'i');
// Sentences (split naively on periods) that contain at least one top word.
// With no top words the pattern would degenerate to an empty group that
// matches almost anything, so that case yields no sentences instead.
var topSentences = top3Words.length === 0 ? [] : text.split(/\./).filter(function (sentence) {
    return topWordsRx.test(sentence);
});
// ["This is a sentence with few words in it", " Repeated things", " This is a sentence with words that are repeated"]
请注意,您可以通过跟踪单词标记化过程中单词所属的句子来提高算法效率。