我有一大块文字,我想找出最常用的单词(除了少数,如“the”,“a”,“and”等)。
如何搜索这个文本块以获取最常用的单词?
答案 0 :(得分:24)
您应该将字符串拆分为单词,然后遍历单词并为每个单词增加一个计数器:
// Count how often each word occurs in `str`.
// match(/\w+/g) returns only the words themselves; splitting on /\b/ would
// also return the whitespace and punctuation between them, and those pieces
// would then be counted as if they were words.
// `|| []` covers text without any word characters, where match() returns null.
var wordCounts = { };
var words = str.match(/\w+/g) || [];
for (var i = 0; i < words.length; i++) {
    // The "_" prefix keeps words such as "constructor" from colliding with
    // properties that every object inherits.
    wordCounts["_" + words[i]] = (wordCounts["_" + words[i]] || 0) + 1;
}
"_" +
使代码能够正确处理 constructor 这类本身已经是对象属性名的单词。
您可能还希望改写为 words[i].toLowerCase(),使计数不区分大小写。
答案 1 :(得分:1)
这是我的方法
// Demo: run findMostRepeatedWord on a short sample sentence and print the result.
let str = 'How do you do?';
console.log(findMostRepeatedWord(str)); // Result: "do"
/**
 * Returns the word that occurs most often in `str`.
 *
 * Words are maximal runs of `\w` characters (ASCII letters, digits and
 * underscore) and are compared case-sensitively. When several words share the
 * highest count, the one that appears first in the text is returned.
 *
 * @param {string} str - Text to scan.
 * @returns {string} The most frequent word, or '' when `str` contains no words.
 */
function findMostRepeatedWord(str) {
  // match() returns null when nothing matches; fall back to an empty list so
  // empty or punctuation-only text does not throw.
  const words = str.match(/\w+/g) || []; // e.g. [ 'How', 'do', 'you', 'do' ]

  // A Map has no inherited keys, so words such as "constructor" or "toString"
  // are counted like any other word.
  const occurrences = new Map();
  for (const word of words) {
    occurrences.set(word, (occurrences.get(word) || 0) + 1);
  }
  // e.g. Map { 'How' => 1, 'do' => 2, 'you' => 1 }

  let max = 0;
  let mostRepeatedWord = '';
  // A Map iterates in insertion order, so with the strict `>` the earliest
  // word wins a tie. Each distinct word is visited once.
  for (const [word, count] of occurrences) {
    if (count > max) {
      max = count;
      mostRepeatedWord = word;
    }
  }
  return mostRepeatedWord;
}
答案 2 :(得分:0)
我来自“未来”:这个问题后来又被人提了一遍,可我才刚开始写解决方案,那个问题就被标记为已回答了。无论如何,下面的代码是对 SLaks 答案的补充。
/**
 * Lists the most frequent words of a text, most frequent first.
 *
 * Words are whitespace-delimited tokens and are compared case-sensitively;
 * punctuation attached to a token stays part of the word. Words with equal
 * counts keep the order of their first appearance in the text.
 *
 * @param {string} string - Text to analyse.
 * @param {number} ammount - Maximum number of entries to return. Fractional
 *   values are rounded up; a missing, non-numeric or non-positive value
 *   yields an empty array.
 * @returns {Array<{word: string, occurences: number}>} At most `ammount`
 *   entries, sorted by descending count.
 */
function nthMostCommon(string, ammount) {
  const requested = Math.ceil(Number(ammount));
  // NaN and negative numbers both fail the `> 0` comparison and become 0.
  const limit = requested > 0 ? requested : 0;

  const wordOccurrences = new Map();
  for (const word of string.split(/\s+/)) {
    // Leading or trailing whitespace (and an empty input) produce '' tokens,
    // which are not words and must not be counted.
    if (word === '') {
      continue;
    }
    wordOccurrences.set(word, (wordOccurrences.get(word) || 0) + 1);
  }

  // Array.prototype.sort is stable, so ties stay in first-appearance order.
  return Array.from(wordOccurrences, ([word, occurences]) => ({ word, occurences }))
    .sort((a, b) => b.occurences - a.occurences)
    .slice(0, limit);
}
答案 3 :(得分:0)
Lodash 1-liner:
// Groups identical words, picks the largest group and takes its first element.
// The `|| []` guard matters when `str` contains no word characters: match()
// then returns null, the list of groups is empty and _.maxBy returns undefined,
// so indexing it directly would throw a TypeError. With the guard the result
// is simply undefined.
const mostFrequentWord = (_.maxBy(Object.values(_.groupBy(str.match(/\b(\w+)\b/g))), w => w.length) || [])[0];
答案 4 :(得分:0)
通过此函数,您可以列出出现次数最多的单词。此函数返回一个数组。
/**
 * Returns every word that occurs the maximum number of times in `string`.
 *
 * Words are whitespace-delimited tokens and are compared case-sensitively;
 * punctuation attached to a token stays part of the word.
 *
 * Declared as a function rather than assigned to an undeclared name, because
 * assigning to an undeclared name throws a ReferenceError in strict mode and
 * in ES modules.
 *
 * @param {string} string - Text to analyse.
 * @returns {string[]} All words sharing the highest count, in order of first
 *   appearance; an empty array when the text contains no words.
 */
function findMostFrequentWords(string) {
  // A Map instead of an Array used as a dictionary: an Array's own `length`
  // and inherited members such as `constructor` corrupt the counts for words
  // with those names.
  const wordOccurrences = new Map();
  for (const word of string.split(/\s+/)) {
    // Leading or trailing whitespace (and an empty input) produce '' tokens,
    // which are not words and must not be counted.
    if (word === '') {
      continue;
    }
    wordOccurrences.set(word, (wordOccurrences.get(word) || 0) + 1);
  }

  let maximum = 0;
  for (const count of wordOccurrences.values()) {
    if (count > maximum) {
      maximum = count;
    }
  }

  const result = [];
  for (const [word, count] of wordOccurrences) {
    if (count === maximum) {
      result.push(word);
    }
  }
  return result;
}
答案 5 :(得分:0)
我以 Gustavo Maloste 的方案为基础,增加了对停用词(stop words,例如 “the”、“a”、“and”)的过滤。
// Sample text for the demo below.
let str = 'Delhi is a crowded city. There are very few rich people who travel by their own vehicles. The majority of the people cannot afford to hire a taxi or a three-wheeler. They have to depend on D.T.C. buses, which are the cheapest mode of conveyance. D.T.C. buses are like blood capillaries of our body spreading all over in Delhi. One day I had to go to railway station to receive my uncle. I had to reach there by 9.30 a.m. knowing the irregularity of D.T.C. bus service; I left my home at 7.30 a.m. and reached the bus stop. There was a long queue. Everybody was waiting for the bus but the buses were passing one after another without stopping. I kept waiting for about an hour. I was feeling very restless and I was afraid that I might not be able to reach the station in time. It was 8.45. Luckily a bus stopped just in front of me. It was overcrowded but somehow I managed to get into the bus. Some passengers were hanging on the footboard, so there was no question of getting a seat. It was very uncomfortable. We were feeling suffocated. All of a sudden, an old man declared that his pocket had been picked. He accused the man standing beside him. The young man took a knife out of his pocket and waved it in the air. No body dared to catch him. I thanked God when the bus stopped at the railway station. I reached there just in time.';
// Ask nthMostCommon for up to 10 entries from the sample text and print them.
let occur = nthMostCommon(str, 10);
console.log(occur);
/**
 * Lists the most frequent words of a text, ignoring a fixed set of very
 * common English words, most frequent first.
 *
 * The text is lower-cased and split on whitespace. Punctuation attached to a
 * token stays part of the word, so "city." and "city" are distinct words and
 * "there," is not matched by the stop word "there". Words with equal counts
 * keep the order of their first appearance in the text.
 *
 * @param {string} str - Text to analyse.
 * @param {number} amount - Maximum number of entries to return. Fractional
 *   values are rounded up; a missing, non-numeric or non-positive value
 *   yields an empty array.
 * @returns {Array<{word: string, occurences: number}>} At most `amount`
 *   entries, sorted by descending count.
 */
function nthMostCommon(str, amount) {
  // Built inside the function on purpose: a function declaration is hoisted
  // and may be called before a module-level constant placed next to it has
  // been initialised. A Set gives constant-time lookups and removes the
  // duplicate entries the list used to contain.
  const stickyWords = new Set([
    'the', 'there', 'by', 'at', 'and', 'so', 'if', 'than', 'but', 'about',
    'in', 'on', 'was', 'for', 'that', 'said', 'a', 'or', 'of', 'to',
    'will', 'be', 'what', 'get', 'go', 'think', 'just', 'every', 'are', 'it',
    'were', 'had', 'i', 'very',
  ]);

  const requested = Math.ceil(Number(amount));
  // NaN and negative numbers both fail the `> 0` comparison and become 0.
  const limit = requested > 0 ? requested : 0;

  const wordOccurrences = new Map();
  for (const word of str.toLowerCase().split(/\s+/)) {
    // Leading or trailing whitespace (and an empty input) produce '' tokens,
    // which are not words and must not be counted.
    if (word === '' || stickyWords.has(word)) {
      continue;
    }
    wordOccurrences.set(word, (wordOccurrences.get(word) || 0) + 1);
  }

  // Array.prototype.sort is stable, so ties stay in first-appearance order.
  return Array.from(wordOccurrences, ([word, occurences]) => ({ word, occurences }))
    .sort((a, b) => b.occurences - a.occurences)
    .slice(0, limit);
}