javascript中的单词频率

时间:2015-06-18 05:03:13

标签: javascript

enter image description here

如何实现javascript函数来计算给定句子中每个单词的频率。

这是我的代码:

function search () {
  var data = document.getElementById('txt').value;
  var temp = data;
  var words = new Array();
  words = temp.split(" ");
  var uniqueWords = new Array();
  var count = new Array();


  for (var i = 0; i < words.length; i++) {
    //var count=0;
    var f = 0;
    for (j = 0; j < uniqueWords.length; j++) {
      if (words[i] == uniqueWords[j]) {
        count[j] = count[j] + 1;
        //uniqueWords[j]=words[i];
        f = 1;
      }
    }
    if (f == 0) {
      count[i] = 1;
      uniqueWords[i] = words[i];
    }
    console.log("count of " + uniqueWords[i] + " - " + count[i]);
  }
}

我无法追查问题..任何帮助都非常有用。 以这种格式输出: 计数是-1 计数 - 2 ..

输入:这是anil是kum the anil

6 个答案:

答案 0 :(得分:15)

我认为你有多个数组,字符串,并且在循环和嵌套循环之间频繁(并且难以跟踪)上下文切换,你会感到过于复杂。

以下是我鼓励您考虑采取的方法。我已经通过内联评论来解释每一步。如果其中任何一项不清楚,请在评论中告诉我,我将重新审视以提高清晰度。

(function () {

    /* Below is a regular expression that finds alphanumeric characters
       Next is a string that could easily be replaced with a reference to a form control
       Lastly, we have an array that will hold any words matching our pattern */
    var pattern = /\w+/g,
        string = "I I am am am yes yes.",
        matchedWords = string.match( pattern );

    /* The Array.prototype.reduce method assists us in producing a single value from an
       array. In this case, we're going to use it to output an object with results. */
    var counts = matchedWords.reduce(function ( stats, word ) {

        /* `stats` is the object that we'll be building up over time.
           `word` is each individual entry in the `matchedWords` array */
        if ( stats.hasOwnProperty( word ) ) {
            /* `stats` already has an entry for the current `word`.
               As a result, let's increment the count for that `word`. */
            stats[ word ] = stats[ word ] + 1;
        } else {
            /* `stats` does not yet have an entry for the current `word`.
               As a result, let's add a new entry, and set count to 1. */
            stats[ word ] = 1;
        }

        /* Because we are building up `stats` over numerous iterations,
           we need to return it for the next pass to modify it. */
        return stats;

    }, {} );

    /* Now that `counts` has our object, we can log it. */
    console.log( counts );

}());

答案 1 :(得分:11)

这是一个JavaScript函数,用于获取句子中每个单词的频率:

function wordFreq(string) {
    var words = string.replace(/[.]/g, '').split(/\s/);
    var freqMap = {};
    words.forEach(function(w) {
        if (!freqMap[w]) {
            freqMap[w] = 0;
        }
        freqMap[w] += 1;
    });

    return freqMap;
}

它将返回单词到字数的哈希值。例如,如果我们这样运行它:

console.log(wordFreq("I am the big the big bull."));
> Object {I: 1, am: 1, the: 2, big: 2, bull: 1}

您可以使用Object.keys(result).sort().forEach(result) {...}迭代这些字词。所以我们可以像这样挂钩:

var freq = wordFreq("I am the big the big bull.");
Object.keys(freq).sort().forEach(function(word) {
    console.log("count of " + word + " is " + freq[word]);
});

哪个会输出:

count of I is 1
count of am is 1
count of big is 2
count of bull is 1
count of the is 2

JSFiddle:http://jsfiddle.net/ah6wsbs6/

这是ES6中的wordFreq函数:

function wordFreq(string) {
  return string.replace(/[.]/g, '')
    .split(/\s/)
    .reduce((map, word) =>
      Object.assign(map, {
        [word]: (map[word])
          ? map[word] + 1
          : 1,
      }),
      {}
    );
}

JSFiddle:http://jsfiddle.net/r1Lo79us/

答案 2 :(得分:1)

以下是您自己代码的更新版本......

<!DOCTYPE html>
<html>
<head>
<title>string frequency</title>
<style type="text/css">
#text{
    width:250px;
}
</style>
</head>

<body >

<textarea id="txt" cols="25" rows="3" placeholder="add your text here">   </textarea></br>
<button type="button" onclick="search()">search</button>

    <script >

        function search()
        {
            var data=document.getElementById('txt').value;
            var temp=data;
            var words=new Array();
            words=temp.split(" ");

            var unique = {};


            for (var i = 0; i < words.length; i++) {
                var word = words[i];
                console.log(word);

                if (word in unique)
                {
                    console.log("word found");
                    var count  = unique[word];
                    count ++;
                    unique[word]=count;
                }
                else
                {
                    console.log("word NOT found");
                    unique[word]=1;
                }
            }
            console.log(unique);
        }

    </script>

</body>

我认为你的循环过于复杂。此外,尝试在仍然对单词数组进行第一次传递时产生最终计数肯定会失败,因为在检查数组中的每个单词之前,您无法测试唯一性。

我使用Javascript对象作为关联数组,而不是所有计数器,因此我们可以存储每个唯一的单词,以及它出现次数的计数。

然后,一旦我们退出循环,我们就可以看到最终结果。

此外,此解决方案不使用正则表达式;)

我还要补充说,根据空格计算单词非常困难。在此代码中,&#34;一,二,一&#34;将导致&#34; one,&#34;和&#34;一个&#34;作为不同的,独特的单词。

答案 3 :(得分:1)

虽然这里的两个答案都是正确的,但也许更好,但没有一个解决OP的问题(他的代码有什么问题)。

OP代码的问题在于:

if(f==0){
    count[i]=1;
    uniqueWords[i]=words[i];
}

在每个新单词(唯一单词)上,代码会将该单词添加到uniqueWords,该单词位于words中。因此uniqueWords数组存在空白。这是一些undefined值的原因。

尝试打印uniqueWords。它应该给出类似的东西:

  

[&#34;这&#34;,&#34;是&#34;,&#34; anil&#34;,4:&#34; kum&#34;,5:&#34;&# 34]

请注意索引3没有元素。

最后计数的打印应该在处理words数组中的所有单词之后。

此处有更正版本:

function search()
{
    var data=document.getElementById('txt').value;
    var temp=data;
    var words=new Array();
    words=temp.split(" ");
    var uniqueWords=new Array();
    var count=new Array();


    for (var i = 0; i < words.length; i++) {
        //var count=0;
        var f=0;
        for(j=0;j<uniqueWords.length;j++){
            if(words[i]==uniqueWords[j]){
                count[j]=count[j]+1;
                //uniqueWords[j]=words[i];
                f=1;
            }
        }
        if(f==0){
            count[i]=1;
            uniqueWords[i]=words[i];
        }
    }
    for ( i = 0; i < uniqueWords.length; i++) {
        if (typeof uniqueWords[i] !== 'undefined')
            console.log("count of "+uniqueWords[i]+" - "+count[i]);       
    }
}

我刚刚将处理循环中的计数打印移动到一个新循环并添加了if not undefined检查。

小提琴:https://jsfiddle.net/cdLgaq3a/

答案 4 :(得分:0)

对于slightly better efficiency,我将使用Sampson的match-reduce方法。这是它的修改后的版本,可以立即投入生产。它不是完美的,但它应该涵盖绝大多数情况(即“足够好”)。

function calcWordFreq(s) {
  // Normalize
  s = s.toLowerCase();
  // Strip quotes and brackets
  s = s.replace(/["“”(\[{}\])]|\B['‘]([^'’]+)['’]/g, '$1');
  // Strip dashes and ellipses
  s = s.replace(/[‒–—―…]|--|\.\.\./g, ' ');
  // Strip punctuation marks
  s = s.replace(/[!?;:.,]\B/g, '');
  return s.match(/\S+/g).reduce(function(oFreq, sWord) {
    if (oFreq.hasOwnProperty(sWord)) ++oFreq[sWord];
    else oFreq[sWord] = 1;
    return oFreq;
  }, {});
}

calcWordFreq('A ‘bad’, “BAD” wolf-man...a good ol\' spook -- I\'m frightened!')返回

{
  "a": 2
  "bad": 2
  "frightened": 1
  "good": 1
  "i'm": 1
  "ol'": 1
  "spook": 1
  "wolf-man": 1
}

答案 5 :(得分:0)

const sentence = 'Hi my friend how are you my friend';

const countWords = (sentence) => {
    const convertToObject = sentence.split(" ").map( (i, k) => {
        return {
          element: {
              word: i,
              nr: sentence.split(" ").filter(j => j === i).length + ' occurrence',
          }

      }
  });
    return Array.from(new Set(convertToObject.map(JSON.stringify))).map(JSON.parse)
};

console.log(countWords(sentence));