Question

我有以下数据结构：

// One record = ((first list, second list), value). The record has to be a std::pair:
// std::vector's second template argument is its allocator, not a second element type.
std::vector<std::pair<std::pair<std::vector<unsigned>, std::vector<unsigned> >, unsigned> > A;

包含以下数据：

((7),(108,109)),5
((7),(108,109)),4
((7),(101,102,110)),3
((7),(101,102)),1
((7),(51)),2
((7),(51,54)),6
((7),(40,54,59)),7
((3),(108,109)),15
((3),(101,102,110)),13
((3),(101,102)),11
((3),(51)),12
((3),(51,54)),16
((3),(40,54,59)),17
((9),(108,109)),25
((9),(108,109)),24
((9),(108,109,110)),20
((9),(101,102,110)),23
((9),(111,112,120)),21
((9),(101,102)),29
((9),(51)),22
((9),(51,54)),26
((9),(40,54,59)),7
((8,2,10),(108,109)),25
((8,2,10),(108,109)),24
((8,2,10),(108,109,110)),20
((8,2,10),(101,102,110)),23
((8,2,10),(111,112,120)),21
((8,2,10),(101,102)),29
((8,2,10),(51)),22
((8,2,10),(51,54)),26
((8,2,10),(40,54,59)),7
((5,7),(108,109)),35
((5,7),(108,109)),34
((5,7),(108,109,110)),30
((5,7),(101,102,110)),33
((5,7),(111,112,120)),31
((5,7),(101,102)),39
((5,7),(51)),32
((5,7),(51,54)),36
((5,7),(40,54,59)),37

现在我想以下列方式安排我的数据：

((3),(101,102)),11
((3),(108,109)),15
((3),(101,102,110)),13
((7),(101,102)),1
((7),(108,109)),5
((7),(108,109)),4
((7),(101,102,110)),3
((9),(101,102)),29
((9),(108,109)),25
((9),(108,109)),24
((9),(101,102,110)),23
((9),(108,109,110)),20
((9),(111,112,120)),21
((5,7),(101,102)),39
((5,7),(108,109)),35
((5,7),(108,109)),34
((5,7),(101,102,110)),33
((5,7),(108,109,110)),30
((5,7),(111,112,120)),31
((8,2,10),(101,102)),29
((8,2,10),(108,109)),25
((8,2,10),(108,109)),24
((8,2,10),(101,102,110)),23
((8,2,10),(108,109,110)),20
((8,2,10),(111,112,120)),21

((3),(51)),12
((3),(51,54)),16
((3),(40,54,59)),17
((7),(51)),2
((7),(51,54)),6
((7),(40,54,59)),7
((9),(51)),22
((9),(51,54)),26
((9),(40,54,59)),7
((5,7),(51)),32
((5,7),(51,54)),36
((5,7),(40,54,59)),37
((8,2,10),(51)),22
((8,2,10),(51,54)),26
((8,2,10),(40,54,59)),7

排序方式如下：首先按大小对内层 pair<> 的第一个 vector<> 排序，大小相同时再按字典序排序。内层 pair<> 的第二个 vector<> 同样先按大小排序，再按字典序排序。整个数据根据向量 A 中内层 pair<> 的第二个 vector<> 进行聚类，即由用户指定把 A 的所有元素按以下分组聚集在一起：（（101,102），（108,109），（101,102,110），（108,109,110），（111,112,120））和（（51），（51,54），（40,54,59））。

我知道可以用下面的代码对向量（i）先按大小、（ii）再按字典序进行排序：

// Comparison predicate for std::sort over pairs of vectors.
// Only the FIRST vector of each pair is examined: shorter vectors sort first, and
// vectors of equal length are ordered lexicographically. The second vector of the
// pair does not take part in the comparison.
// Returns true when `a` must be placed before `b`.
bool mySort(const std::pair<std::vector<unsigned>,std::vector<unsigned> > &a , const std::pair<std::vector<unsigned>,std::vector<unsigned> > &b)
{
    if (a.first.size() != b.first.size()) {
        // Different lengths: sort by size.
        return a.first.size() < b.first.size();
    }
    // Equal lengths: compare element by element.
    return std::lexicographical_compare(a.first.begin(), a.first.end(),
                                        b.first.begin(), b.first.end());
}
// Minimal driver from the question: sorts a (here still empty) vector of vector
// pairs using mySort as the ordering predicate.
int main()
{
    typedef std::pair<std::vector<unsigned>, std::vector<unsigned> > VecPair;

    std::vector<VecPair> a;
    std::sort(a.begin(), a.end(), mySort);
    return 0;
}

但是我不知道如何在排序（（i）先按大小，（ii）再按字典序）的同时，根据向量 A 中内层 pair 的第二个向量把元素聚集在一起。有人能帮我解决这个问题吗？

此外，我的向量 A 非常大。因此，如果能给出高效的解决方案就更好了。

我使用的gcc版本是：gcc（Ubuntu / Linaro 4.6.3-1ubuntu5）4.6.3

只要能在 C 或 C++ 中达到同样的效果，使用任何数据结构我都可以接受（因为我只关心我指定的这种特定顺序）。

编辑：生成输入的代码：

// Container being filled: every element is a (first vector, second vector) pair.
std::vector<std::pair<std::vector<unsigned>, std::vector<unsigned> > > a; 
// Scratch buffers reused for every record: b is the current first vector,
// p the current second vector. make_pair(b,p) stores copies of both.
vector<unsigned> b; vector<unsigned> p;
// --- Records whose first vector is (7) ---
// (7),(108,109)
b.push_back(7); p.push_back(108); p.push_back(109);
a.push_back(make_pair(b,p));
p.clear();
// (7),(101,102,110)
p.push_back(101); p.push_back(102); p.push_back(110);
a.push_back(make_pair(b,p));
p.clear();
// (7),(101,102)
p.push_back(101); p.push_back(102);
a.push_back(make_pair(b,p));
p.clear();
// (7),(51)
p.push_back(51);
a.push_back(make_pair(b,p));
p.clear();
// (7),(51,54)
p.push_back(51); p.push_back(54);
a.push_back(make_pair(b,p));
p.clear();
// (7),(40,54,59)
p.push_back(40); p.push_back(54); p.push_back(59);
a.push_back(make_pair(b,p));
// --- Records whose first vector is (3): reset both buffers, then the same six second vectors ---
b.clear(); p.clear();
b.push_back(3);
// (3),(108,109)
p.push_back(108); p.push_back(109); 
a.push_back(make_pair(b,p));
p.clear();
// (3),(101,102,110)
p.push_back(101); p.push_back(102); p.push_back(110);
a.push_back(make_pair(b,p));
p.clear();
// (3),(101,102)
p.push_back(101); p.push_back(102); 
a.push_back(make_pair(b,p));
p.clear();
// (3),(51)
p.push_back(51); 
a.push_back(make_pair(b,p));
p.clear();
// (3),(51,54)
p.push_back(51); p.push_back(54);
a.push_back(make_pair(b,p));
p.clear();
// (3),(40,54,59)
p.push_back(40); p.push_back(54); p.push_back(59);
a.push_back(make_pair(b,p));

Answer 1

您可以先进行分区（std::partition），然后对每个“簇”分别排序（std::sort）。

以下可能会有所帮助：

// Element type of `a`. Written as a typedef because alias-declarations
// (`using X = ...;`) are only available from GCC 4.7, while the question targets GCC 4.6.3.
typedef std::pair<std::vector<unsigned>, std::vector<unsigned> > vec_pair;

// Second-vector values that form the first cluster; every other value falls
// into the second cluster.
std::set<std::vector<unsigned> > wanted = {
    {101,102}, {108,109}, {101,102,110}, {108,109,110}, {111,112,120}};

// Move all elements whose second vector is in `wanted` to the front. `mid` marks
// the boundary between the two clusters. std::partition does not keep the relative
// order of elements, which is fine because each half is sorted right after.
auto mid = std::partition(a.begin(), a.end(), [&](const vec_pair& p){
    return wanted.count(p.second) != 0;
});

// Sort each cluster on its own. NOTE: the question's mySort compares only `.first`,
// so elements with equal first vectors end up in unspecified order; extend it to
// break ties on `.second` (size, then lexicographic) to get the requested output.
std::sort(a.begin(), mid, mySort); // First cluster
std::sort(mid, a.end(), mySort);   // Second cluster

Live example

在 C++03 中，可以用下面的函子代替 lambda：

// Unary predicate usable with std::partition in place of a lambda: returns true
// when an element's second vector is one of the `wanted` values.
// The set is held by reference, so it must outlive the functor.
// Nested template arguments are closed with `> >` because C++03 parses `>>` as the
// shift operator (vec_pair likewise has to be a typedef, not a `using` alias, there).
struct allowed
{
public:
    explicit allowed(const std::set<std::vector<unsigned> >& wanted) :  wanted(wanted) {}

    // True when p.second is contained in the wanted set.
    bool operator () (const vec_pair& p) const {
        return wanted.count(p.second) != 0;
    }
private:
    const std::set<std::vector<unsigned> >& wanted;
};

并且代码变为：

std::vector<vec_pair>::iterator mid = std::partition(a.begin(), a.end(), allowed(wanted));

Answer 2

我删除了我的第一个答案，因为结果不正确。

好的，所以你的第一个问题显然是问题规范。你想要什么以及需要发生什么是非常不清楚的。

所以这是我对你的问题的理解：您有一组由以下数据结构定义的数据：

typedef std::vector<unsigned> listType;               // one list of values
typedef std::pair < listType, listType > vectorPair;  // (first list, second list)
typedef std::pair< vectorPair, unsigned> recordType;  // ((first list, second list), value)
std::vector< recordType >  A;                         // the data set to be sorted

问题是根据以下优先级对A进行排序：

根据“群集”排序（请参阅下面的定义）
第一个列表（recordType.first.first）首先按大小排序，然后按字典顺序排序
第二个列表（recordType.first.second）首先按大小排序，然后按字典顺序排序
最后使用recordType的第二个值（recordType.second）？

“clusters”由recordType.first.second中值的特定集合定义。所以对于这个例子集群1：（（101,102），（108,109），（101,102,110），（108,109,110），（111,112,120））和集群2：（（51），（51,54），（40,54,59））

好了，现在排序规则已经定义得更清楚了，只要编写正确的比较函数，问题就变得相当简单。

排序的速度主要取决于确定簇编号的速度。为此，我使用了 unordered_map 来实现接近常数时间的查找。（我的哈希函数假设第二个列表中的值小于 256，并且列表只有少数几个元素；对于实际应用，您可能需要更好的哈希函数。）

以下是该计划：

// Standard headers this listing relies on (MSVC builds additionally need <tchar.h>
// for _tmain/_TCHAR).
#include <algorithm>      // std::sort, std::lexicographical_compare
#include <cstddef>        // size_t
#include <iostream>       // std::cout, std::cin
#include <unordered_map>  // std::unordered_map
#include <utility>        // std::pair
#include <vector>         // std::vector

using namespace std;


typedef std::vector<unsigned> listType;               // one list of values
typedef std::pair < listType, listType > vectorPair;  // (first list, second list)
typedef std::pair< vectorPair, unsigned> recordType;  // ((first list, second list), value)

// Sample data from the question, in its original (unsorted) order.
std::vector< recordType >  A = {
    { { { 7 }, { 108, 109 } }, 5 },
    { { { 7 }, { 108, 109 } }, 4 },
    { { { 7 }, { 101, 102, 110 } }, 3 },
    { { { 7 }, { 101, 102 } }, 1 },
    { { { 7 }, { 51 } }, 2 },
    { { { 7 }, { 51, 54 } }, 6 },
    { { { 7 }, { 40, 54, 59 } }, 7 },
    { { { 3 }, { 108, 109 } }, 15 },
    { { { 3 }, { 101, 102, 110 } }, 13 },
    { { { 3 }, { 101, 102 } }, 11 },
    { { { 3 }, { 51 } }, 12 },
    { { { 3 }, { 51, 54 } }, 16 },
    { { { 3 }, { 40, 54, 59 } }, 17 },
    { { { 9 }, { 108, 109 } }, 25 },
    { { { 9 }, { 108, 109 } }, 24 },
    { { { 9 }, { 108, 109, 110 } }, 20 },
    { { { 9 }, { 101, 102, 110 } }, 23 },
    { { { 9 }, { 111, 112, 120 } }, 21 },
    { { { 9 }, { 101, 102 } }, 29 },
    { { { 9 }, { 51 } }, 22 },
    { { { 9 }, { 51, 54 } }, 26 },
    { { { 9 }, { 40, 54, 59 } }, 7 },
    { { { 8, 2, 10 }, { 108, 109 } }, 25 },
    { { { 8, 2, 10 }, { 108, 109 } }, 24 },
    { { { 8, 2, 10 }, { 108, 109, 110 } }, 20 },
    { { { 8, 2, 10 }, { 101, 102, 110 } }, 23 },
    { { { 8, 2, 10 }, { 111, 112, 120 } }, 21 },
    { { { 8, 2, 10 }, { 101, 102 } }, 29 },
    { { { 8, 2, 10 }, { 51 } }, 22 },
    { { { 8, 2, 10 }, { 51, 54 } }, 26 },
    { { { 8, 2, 10 }, { 40, 54, 59 } }, 7 },
    { { { 5, 7 }, { 108, 109 } }, 35 },
    { { { 5, 7 }, { 108, 109 } }, 34 },
    { { { 5, 7 }, { 108, 109, 110 } }, 30 },
    { { { 5, 7 }, { 101, 102, 110 } }, 33 },
    { { { 5, 7 }, { 111, 112, 120 } }, 31 },
    { { { 5, 7 }, { 101, 102 } }, 39 },
    { { { 5, 7 }, { 51 } }, 32 },
    { { { 5, 7 }, { 51, 54 } }, 36 },
    { { { 5, 7 }, { 40, 54, 59 } }, 37 } };




// Radix of the hash below: each list element is folded in as one base-256 digit.
#define MAXVALUE 256

// Hash functor for listType keys (used as the hasher of an unordered_map).
// Folds the elements left to right as hash = hash * MAXVALUE + value, so short
// lists with values below MAXVALUE get distinct hashes; arithmetic wraps modulo
// the range of size_t for longer lists.
struct myHash
{
    // const-qualified: the Hash requirements demand that the functor be callable
    // on a const object, and standard containers invoke it that way.
    size_t  operator()(const listType& key) const
    {
        size_t hash = 0;
        for (unsigned value : key)
        {
            hash = (hash * MAXVALUE) + value;
        }
        return hash;
    }
};

// Hash map from a list value to an int cluster number, hashed with myHash.
typedef std::unordered_map<listType, int, myHash> clusterHash;

// Global lookup table consulted by calculateClusterNumber(); default-constructed
// (empty) here and filled in by the program's entry point before sorting.
clusterHash clusterNumbers;



int calculateClusterNumber(const listType & list)
{   
    //example code (replace with what you need here)
    clusterHash::const_iterator i = clusterNumbers.find(list);
    if (i == clusterNumbers.end())
        return -1; //not found, return default
    else
        return i->second;
}

// Three-way comparison of two lists: a shorter list orders first; lists of equal
// length are ordered lexicographically. Returns a negative value, zero, or a
// positive value when lhs orders before, equal to, or after rhs.
static int compareSizeThenLex(const std::vector<unsigned> & lhs, const std::vector<unsigned> & rhs)
{
    if (lhs.size() != rhs.size())
        return lhs.size() < rhs.size() ? -1 : 1;

    // Equal lengths: one pass to the first differing element decides the order.
    const auto diff = std::mismatch(lhs.begin(), lhs.end(), rhs.begin());
    if (diff.first == lhs.end())
        return 0;
    return *diff.first < *diff.second ? -1 : 1;
}

// Ordering predicate for std::sort over records. Keys, from most to least significant:
//   1. cluster number of the second list (calculateClusterNumber),
//   2. first list  (a.first.first):  size, then lexicographic,
//   3. second list (a.first.second): size, then lexicographic,
//   4. the record's trailing value (a.second).
// Returns true when `a` must be placed before `b`.
bool mySort(const recordType &a, const recordType &b)
{
    //on highest level sort according to cluster number
    const int clusterA = calculateClusterNumber(a.first.second);
    const int clusterB = calculateClusterNumber(b.first.second);
    if (clusterA != clusterB)
        return clusterA < clusterB;

    //next level: the first list of the inner pair
    const int byFirst = compareSizeThenLex(a.first.first, b.first.first);
    if (byFirst != 0)
        return byFirst < 0;

    //first lists are equal, so continue with the second list
    const int bySecond = compareSizeThenLex(a.first.second, b.first.second);
    if (bySecond != 0)
        return bySecond < 0;

    //everything else is equal: the second value of the top-level pair decides
    return a.second < b.second;
}


int _tmain(int argc, _TCHAR* argv[])
{

    //setup the clusters:
    clusterNumbers.insert({ {101,102}, 1 });
    clusterNumbers.insert({ { 108, 109 }, 1 });
    clusterNumbers.insert({ { 101, 102, 110 }, 1 });
    clusterNumbers.insert({ { 108, 109, 110 }, 1 });
    clusterNumbers.insert({ { 111, 112, 120 }, 1 });
    clusterNumbers.insert({ { 51 }, 2 });
    clusterNumbers.insert({ { 51, 54 }, 2 });
    clusterNumbers.insert({ { 40,54,59 }, 2 });

    std::sort(A.begin(), A.end(), mySort);

    for (recordType & r : A)
    {
        auto printList = [](const listType & l) {for (unsigned u : l)std::cout << u << ","; };

        std::cout << "(";
        printList(r.first.first);
        std::cout << "),(";     
        printList(r.first.second);
        std::cout << ")," << r.second <<  std::endl;
    }



    char c;
    cin >> c;

    return 0;
}

就速度而言，这将接近于O（n.log（n））但是对于非常大的数组，所有lexicographical_compare函数都可能会使事情变慢。

先只按簇做一次线性的 O（n）分区是否会更快，这一点值得验证，不过我怀疑它并不会有多大帮助。真正会稍有帮助的是预先计算簇编号，但这需要您修改数据结构。

最后，据我查到的资料，GCC 4.6 确实支持右值引用，所以我认为排序内部的交换会很高效，不会产生大量的 memcpy。

我已经对此进行了测试，似乎吐出了正确的结果。（我正在使用Visual Studio 2013）

对数组或向量进行排序和聚类

2 个答案: