我们说我有一个数组:
[
"I want **a dog**",
"**A dog** is here",
"Pet **a dog**",
"A **red cat**",
"**red cat** is cute"
...
]
如何找出重复的词组是什么,而不只是单词? 例如,我喜欢" 狗"和" 红猫"被退回。
我发现的大多数现有帖子只是关于获取单个单词,而不是短语(多个单词)。
答案 0 :(得分:0)
这不是javascript函数的最终版本,可以进一步优化。也可能需要进行少量更改,但它可以满足您的要求。
function GetPhrases(stringsArray) {
//Array to split your string into words.
var jaggedArray = [];
//Array to keep indexes of strings where 2 matching words are found together.
var newArray = [];
var phrases = [];
//Loop through your array
for (var ic = 0; ic < stringsArray.length; ic++) {
//Convert every item to array of strings
var items = (stringsArray[ic]).split(" ");
for (var it = 0; it < items.length; it++)
items[it] = items[it].toLowerCase();
//Push the array of words to main array
jaggedArray.push(items);
}
//console.log(jaggedArray);
// Loop through the main array
for (var iLoop = 0; iLoop < jaggedArray.length; iLoop++) {
// For every item in main array, loop through words in that item.
for (var ik = 0; ik < jaggedArray[iLoop].length; ik++) {
var currentWord = jaggedArray[iLoop][ik];
// For every word, check its existence in the main array in all items coming after current item.
for (var il = iLoop + 1; il < jaggedArray.length; il++) {
// Find the index in the string.
var indexOfFind = jaggedArray[il].indexOf(currentWord);
if (indexOfFind > 0) {
// if matching index is more than 0, find if the word before this word also matches.
var indexofPrevWord = jaggedArray[il].indexOf(jaggedArray[iLoop][ik - 1]);
if ((indexofPrevWord >= 0) && (indexofPrevWord == (indexOfFind - 1)))
if (newArray.indexOf(il + " - " + iLoop) < 0)
newArray.push(il + " - " + iLoop);
// if matching index is more than 0, find if the word after this word also matches.
var indexofNextWord = jaggedArray[il].indexOf(jaggedArray[iLoop][ik + 1]);
if (indexofNextWord >= 0 && (indexofNextWord == (indexOfFind + 1)))
if (newArray.indexOf(il + " - " + iLoop) < 0)
newArray.push(il + " - " + iLoop);
}
else if (indexOfFind = 0) {
// if matching index is more than 0, find if the word after this word also matches.
var indexofNewWord = jaggedArray[il].indexOf(jaggedArray[iLoop][ik + 1]);
if (indexofNewWord >= 0 && (indexofNewWord == (indexOfFind + 1)))
if (newArray.indexOf(il + " - " + iLoop) < 0)
newArray.push(il + " - " + iLoop);
}
}
}
}
//newArray will store indexes of those string arrays in jagged array which has a matching sequence of atleast 2 words.
//console.log(newArray);
//Loop through newArray
for (var itl = 0; itl < newArray.length; itl++) {
var item = newArray[itl];
var values = item.split(" - ");
var firstArrayItem = jaggedArray[values[0]];
var secondArrayItem = jaggedArray[values[1]];
var phraseStartPoint = [];
//for every word in firstItem
for (var iy = 0; iy < firstArrayItem.length - 1; iy++) {
var t = iy + 1;
// check if that word and next word exist in second array
if (secondArrayItem.toString().indexOf(firstArrayItem[iy] + "," + firstArrayItem[t]) >= 0) {
// if they do exist, get the indexes of these and store in local array, if they are not there, since we do not want repeating words later.
if (phraseStartPoint.indexOf(iy) < 0)
phraseStartPoint.push(iy);
if (phraseStartPoint.indexOf(t) < 0)
phraseStartPoint.push(t);
}
}
var str = "";
// Prepare the phrase from the local array and push into phrases array, if it not exists there.
for (var ifinalLoop = 0; ifinalLoop < phraseStartPoint.length; ifinalLoop++) {
str = str + firstArrayItem[phraseStartPoint[ifinalLoop]] + " ";
}
if (phrases.indexOf(str) < 0)
phrases.push(str);
}
return phrases;
}
var stringsArray = [
"I want a dog",
"A dog is here",
"Pet a dog is cute",
"A red cat is here",
"red cat is cute"
];
var result = GetPhrases(stringsArray);
// Print the phrases array.
for (var iPhrase = 0; iPhrase < result.length; iPhrase++) {
console.log(result[iPhrase]);
}
答案 1 :(得分:0)
你给我们的信息太少了。我假设你按空格分裂。 ES6救援:)。当你正在寻找重复的短语时,集合有O(1)查找。
编辑:刚刚意识到您可以通过一些小的修改将空间复杂度降低一吨。如果你想让我这样做,请给我一个呐喊。
const buildAllPhrases = sentence => {
const splitSentence = sentence.split(" ")
const phraseSize = splitSentence.length
const allPhrases = []
for (let i = phraseSize; i > 0; i--) {
for (let y = 0; y + i <= phraseSize; y++) {
allPhrases.push(splitSentence.slice(y, y + i))
}
}
return allPhrases.map(phrase => phrase.join(" "))
}
const findRepeats = sentences => {
const allPhrases = new Set()
const repeatedPhrases = new Set()
let phrases
sentences.forEach(phrase => {
phrases = buildAllPhrases(phrase)
phrases.forEach(subPhrase => {
if (allPhrases.has(subPhrase)) {
repeatedPhrases.add(subPhrase)
} else {
allPhrases.add(subPhrase)
}
})
})
return [...repeatedPhrases]
}
const sample = [
"I want **a dog**",
"**A dog** is here",
"Pet **a dog**",
"A **red cat**",
"**red cat** is cute"
]
findRepeats(sample)
//['dog**', '**a dog**', '**a', '**red cat**', '**red', 'cat**', 'is']
答案 2 :(得分:0)
使用正则表达式,您可以检测字符串中的重复项
根据这个正则表达式:
(?:.*?)(\b\w.{3,}\b)(?:.*?)(\1)
,
只有在寻找相同模式的两倍时,它才有效
注意:您可以将3
中的{3,}
替换为任何其他整数,并查看更改。
这个参数缩小了你要寻找两次的最小弦长度。