如何打印具有特定单词的句子

时间:2014-08-08 13:48:17

标签: javascript regex

我想从一个字符串中找出 3 个最常用的单词,并打印包含这些单词的句子。目前我找到了很多关于如何获取字符串中最常用单词的资料,但我不知道如何只取其中的 3 个,以及如何打印包含它们的句子。我目前的进度如下:

<!DOCTYPE html>
<html>
<body>
<p id="demo"></p>
<script>
// Sample text to analyse. Double quotes are used so that the apostrophe in
// "don't" does not terminate the string literal.
var text = "This is a sentence with few words in it. Repeated things. This is a sentence with words that are repeated. I don't need this text.";

/**
 * Counts, case-insensitively, how often each word occurs in a string.
 *
 * A word is a run of word characters, optionally followed by an apostrophe
 * and one or two more word characters, so "don't" is kept as a single word.
 *
 * @param {string} str - Text to scan.
 * @returns {Object} Prototype-less dictionary mapping each lower-cased word
 *     to its number of occurrences.
 */
function countWords(str) {
    var wordRegExp = /\w+(?:'\w{1,2})?/g;
    // No prototype: a word such as "constructor" must not collide with a
    // property inherited from Object.prototype.
    var counts = Object.create(null);
    var matches;
    while ((matches = wordRegExp.exec(str)) !== null) {
        var word = matches[0].toLowerCase();
        counts[word] = (counts[word] || 0) + 1;
    }
    return counts;
}

/**
 * Converts a word-count dictionary into [word, count] pairs sorted by
 * descending count.
 *
 * @param {Object} counts - Dictionary as returned by countWords.
 * @returns {Array} Array of [word, count] pairs, most frequent first.
 */
function rankWords(counts) {
    var pairs = Object.keys(counts).map(function (word) {
        return [word, counts[word]];
    });
    pairs.sort(function (a, b) { return b[1] - a[1]; });
    return pairs;
}

var words = countWords(text);
var wordList = rankWords(words);

var n = 100;
var message = ["The top words are:"];
// document and alert only exist in a browser; the checks let the counting
// logic above be loaded in other environments without throwing.
var canWriteToPage = typeof document !== "undefined";

// n may exceed the number of distinct words, so also stop at the end of the list.
for (var i = 0; i < n && i < wordList.length; i++) {
    // Words shorter than five letters are skipped to drop most conjunctions.
    if (wordList[i][0].length >= 5) {
        message.push(wordList[i][0] + " - " + wordList[i][1] + " occurance" +
            (wordList[i][1] == 1 ? "" : "s"));

        if (canWriteToPage) {
            document.write(wordList[i][0] + "<br />");
        }
    }
}

if (typeof alert === "function") {
    alert(message.join("\n"));
}


</script>

</body>
</html>
我使用 if (wordList[i][0].length>=5) 来排除连词,因为连词大多只有 3-4 个字母。

我应该得到这样的答案:

'This is a sentence with few words in it. Repeated things. This is a sentence with words that are repeated.'

因为出现次数最多的 3 个单词是:"sentence"、"repeated" 和 "words"。你知道如何实现这个目标吗?如能给一些指导,将不胜感激。

1 个答案:

答案 0 :(得分:0)

下面是一个示例,我更注重代码的清晰度而不是性能:

FIDDLE DEMO (corrected to top 3 rather than 2)

// Sample input shared by the snippets that follow.
var text = "This is a sentence with few words in it. Repeated things. This is a sentence with words that are repeated. I don't need this text.";

// The three most frequent words longer than four letters, most frequent first.
var top3Words = topWordsIn(text, 3); //["sentence", "words", "repeated"]

/**
 * Lists the most frequent words of a string, most frequent first.
 *
 * Only words longer than four characters are considered, which drops most
 * conjunctions and other short filler words.
 *
 * @param {string} str - Text to analyse.
 * @param {number} [top] - Maximum number of words to return; when omitted
 *     (or otherwise falsy) every candidate word is returned.
 * @returns {string[]} Lower-cased words ordered by descending occurrence count.
 */
function topWordsIn(str, top) {
    var counts = wordsOccurence(str);
    var allWords = Object.keys(counts);
    var candidates = [];

    for (var i = 0; i < allWords.length; i++) {
        if (allWords[i].length > 4) {
            candidates.push(allWords[i]);
        }
    }

    candidates.sort(function (left, right) {
        return counts[right] - counts[left];
    });

    // A falsy limit means "no limit": hand back a copy of the whole list.
    return top ? candidates.slice(0, top) : candidates.slice();
}

/**
 * Counts how many times each word occurs in a string.
 *
 * Words are supplied by forWordsIn, which is also responsible for
 * tokenising and lower-casing them.
 *
 * @param {string} str - Text to analyse.
 * @returns {Object} Prototype-less dictionary mapping each word to its
 *     occurrence count, with keys in order of first appearance.
 */
function wordsOccurence(str) {
    // A prototype-less object is used so that words such as "constructor"
    // or "__proto__" are counted like any other word instead of colliding
    // with members inherited from Object.prototype.
    var occurences = Object.create(null);

    forWordsIn(str, function (word) {
        occurences[word] = (occurences[word] || 0) + 1;
    });

    return occurences;
}

/**
 * Invokes a callback once for every word in a string, in order of appearance.
 *
 * A word is a maximal run of word characters (\w); each one is lower-cased
 * before it is handed to the callback.
 *
 * @param {string} str - Text to scan.
 * @param {function(string)} cb - Called with each lower-cased word.
 */
function forWordsIn(str, cb) {
    var wordPattern = /\b\w+\b/g;
    var found = wordPattern.exec(str);

    // With the g flag, exec resumes after the previous match on each call
    // and returns null once the string is exhausted.
    while (found !== null) {
        cb(found[0].toLowerCase());
        found = wordPattern.exec(str);
    }
}

然后,一旦得到了出现次数最多的单词,就可以简单地把字符串切分成句子(这里我只是按句点分割),并过滤掉不包含这些单词的句子。

// Built once, because the pattern is the same for every sentence.
// \b...\b restricts matches to whole words, and the "i" flag lets a
// capitalised "Repeated" match the lower-cased top word "repeated".
var topWordsRx = new RegExp('\\b(' + top3Words.join('|') + ')\\b', 'i');

// Split on periods and keep only the sentences that mention a top word.
text.split(/\./).filter(function (sentence) {
    return topWordsRx.test(sentence);
});
//["This is a sentence with few words in it", " Repeated things", " This is a sentence with words that are repeated"]

请注意,如果在分词过程中同时记录每个单词所属的句子,就可以提高算法的效率。