在数组中查找重复的短语(而不仅仅是单词)

时间:2017-09-28 03:32:26

标签: javascript arrays regex

假设我有一个数组:

[
"I want **a dog**",
"**A dog** is here",
"Pet **a dog**",
"A **red cat**",
"**red cat** is cute"
...
]

如何找出重复出现的短语(词组),而不只是单词?例如,我希望返回 "a dog" 和 "red cat"。

我发现的大多数现有帖子只是关于获取单个单词,而不是短语(多个单词)。

3 个答案:

答案 0 :(得分:0)

这不是javascript函数的最终版本,可以进一步优化。也可能需要进行少量更改,但它可以满足您的要求。

/**
 * Splits a sentence into lower-cased words.
 *
 * Splitting on runs of whitespace (and dropping empty tokens) keeps leading,
 * trailing or doubled spaces from producing empty "words" that would
 * otherwise match each other.
 *
 * @param {string} sentence - The sentence to tokenise.
 * @returns {string[]} The lower-cased words, in order.
 */
function splitIntoWords(sentence) {
    return sentence
        .toLowerCase()
        .split(/\s+/)
        .filter((word) => word !== "");
}

/**
 * Finds every maximal run of at least `minWords` consecutive words that
 * occurs in both word lists.
 *
 * Words are compared as whole tokens, so a word never matches a fragment of
 * a longer word. Each run is reported separately, so two unrelated shared
 * phrases are never glued together.
 *
 * @param {string[]} later - Words of the sentence the runs are read from.
 * @param {string[]} earlier - Words of the sentence to compare against.
 * @param {number} minWords - Minimum number of words in a reported run.
 * @returns {string[]} The shared runs, each joined with single spaces.
 */
function findSharedWordRuns(later, earlier, minWords) {
    const runs = [];
    for (let a = 0; a < later.length; a++) {
        for (let b = 0; b < earlier.length; b++) {
            if (later[a] !== earlier[b]) {
                continue;
            }
            // Not the start of a run: the preceding words match too, so this
            // alignment was already covered when the run was first entered.
            if (a > 0 && b > 0 && later[a - 1] === earlier[b - 1]) {
                continue;
            }
            let length = 1;
            while (
                a + length < later.length &&
                b + length < earlier.length &&
                later[a + length] === earlier[b + length]
            ) {
                length++;
            }
            if (length >= minWords) {
                runs.push(later.slice(a, a + length).join(" "));
            }
        }
    }
    return runs;
}

/**
 * Returns the phrases (runs of two or more consecutive words) that are
 * shared by at least two strings of the input array.
 *
 * Comparison is case-insensitive; the returned phrases are lower-cased,
 * joined with single spaces and carry no trailing whitespace. Every pair of
 * strings is compared, and each distinct phrase is returned once, in the
 * order it is first discovered.
 *
 * @param {string[]} stringsArray - The sentences to inspect.
 * @returns {string[]} The distinct repeated phrases.
 */
function GetPhrases(stringsArray) {
    const MIN_PHRASE_WORDS = 2;

    // Tokenise once up front so every pairwise comparison reuses the words.
    const sentences = stringsArray.map(splitIntoWords);

    const phrases = [];
    // Mirrors `phrases` for O(1) duplicate checks.
    const seen = new Set();

    for (let earlier = 0; earlier < sentences.length; earlier++) {
        for (let later = earlier + 1; later < sentences.length; later++) {
            const shared = findSharedWordRuns(
                sentences[later],
                sentences[earlier],
                MIN_PHRASE_WORDS
            );
            for (const phrase of shared) {
                if (!seen.has(phrase)) {
                    seen.add(phrase);
                    phrases.push(phrase);
                }
            }
        }
    }
    return phrases;
}

// Example input: several sentences that share multi-word phrases.
var stringsArray = [
    "I want a dog",
    "A dog is here",
    "Pet a dog is cute",
    "A red cat is here",
    "red cat is cute"
];

// Collect the repeated phrases and print each one on its own line.
var result = GetPhrases(stringsArray);
result.forEach(function (phrase) {
    console.log(phrase);
});

答案 1 :(得分:0)

你给我们的信息太少了。我假设你是按空格分割的。ES6 来救场 :)。查找重复短语时,Set(集合)的查找是 O(1) 的。

编辑:刚刚意识到,只需做一些小的修改就可以大幅降低空间复杂度。如果你希望我这样做,请告诉我。

/**
 * Lists every contiguous run of words in a sentence, single words included.
 *
 * The sentence is split on single spaces. Runs are emitted longest first,
 * and left to right within each length.
 *
 * @param {string} sentence - The sentence to expand.
 * @returns {string[]} Every contiguous word run, joined with single spaces.
 */
const buildAllPhrases = (sentence) => {
    const words = sentence.split(" ");
    const total = words.length;
    const result = [];
    let span = total;
    while (span >= 1) {
        // Last index at which a run of `span` words still fits.
        const lastStart = total - span;
        for (let start = 0; start <= lastStart; start += 1) {
            result.push(words.slice(start, start + span).join(" "));
        }
        span -= 1;
    }
    return result;
};

/**
 * Collects the phrases (contiguous word runs, as produced by
 * buildAllPhrases) that are seen more than once across the given sentences.
 *
 * Matching is exact and case-sensitive. All sentences share one "seen" set,
 * so a run that occurs twice inside a single sentence also counts.
 *
 * @param {string[]} sentences - The sentences to scan.
 * @returns {string[]} The repeated phrases, in the order each repeat was
 *     first detected.
 */
const findRepeats = (sentences) => {
    const seen = new Set();
    const duplicates = new Set();

    // First sighting goes to `seen`; any later sighting marks a duplicate.
    const record = (candidate) => {
        const target = seen.has(candidate) ? duplicates : seen;
        target.add(candidate);
    };

    sentences.forEach((sentence) => {
        buildAllPhrases(sentence).forEach(record);
    });

    return Array.from(duplicates);
};

// Sample input taken from the question; the ** markers are part of the strings.
const sample = [
    "I want **a dog**",
    "**A dog** is here",
    "Pet **a dog**",
    "A **red cat**",
    "**red cat** is cute",
];

findRepeats(sample);
// Result:
//['dog**', '**a dog**', '**a', '**red cat**', '**red', 'cat**', 'is']

答案 2 :(得分:0)

使用正则表达式,您可以检测字符串中的重复项。可以使用如下正则表达式: (?:.*?)(\b\w.{3,}\b)(?:.*?)(\1)

只有当同一模式出现两次时,它才有效。注意:您可以将 {3,} 中的 3 替换为任何其他整数,并观察结果的变化。这个参数限定了要查找的重复字符串的最小长度。