为什么我的 Hwang-Lin 实现如此缓慢?

时间:2015-10-05 10:32:11

标签: c++ algorithm optimization merge

我写了一个Hwang-Lin合并算法(A simple algorithm for merging two disjoint linearly ordered sets)的实现。虽然它能正确工作,但运行起来非常慢。这是我的实现:

// Merge of the two adjacent runs A = [first1, first2) and B = [first2, last),
// using the m = |A| elements starting at buf_start as working storage.
//
// Parameters:
//   first1, first2, last - delimit the runs A and B described above.
//   buf_start            - start of a region of at least m elements; its
//                          elements are swapped in and out, never copied.
//
// Iter is used with `it + k`, `it - k` and `it - it`, so it has to behave like
// a random-access iterator.
// NOTE(review): presumably both runs must already be sorted ascending under
// operator< and buf_start must not overlap [first1, last) — neither is checked
// here; confirm against the caller.
//
// Throughout the loop `m` / `n` are the counts of A / B elements that are not
// yet placed, and buff_size is decremented by exactly the same amount as m in
// every branch, so buff_size == m holds on each iteration.
//
// NOTE(review): three of the four branches call block_rotate on
// [.., buffer_end), i.e. across the whole remaining buffer, and the loop can
// run once per element of A. That looks like the dominant cost for large
// inputs — confirm by profiling.
template<typename Iter>
void hwang_lin_buf(Iter first1, Iter first2, Iter last, Iter buf_start) {
    ptrdiff_t m = first2 - first1, n = last - first2, buff_size = m;
    // Move A out to buf_start; the m elements that were at buf_start now sit in
    // [first1, first2).
    swap_blocks(first1, buf_start, m);
    // Bring B to the front so the swapped-in buffer elements follow it.
    block_rotate(first1, first2, last);// B first1 - first1 + n, A buf_start - buf_start + m
    // a: start of A (now at buf_start), b: start of B,
    // [buffer, buffer_end): the free slots that sit right after the unplaced part of B.
    Iter a = buf_start, b = first1, buffer = first1 + n, buffer_end = buffer + buff_size;
    while (m > 0 && n > 0) {
        if (n > m) {
            // B is the longer run: probe B at the 1-based index x = n - 2^alpha + 1
            // with alpha = floor(log2(n / m)), and compare it with the last element of A.
            // NOTE(review): x is computed through double (log2/pow) and stored in a
            // size_t while n and m are ptrdiff_t — an integer computation would avoid
            // the conversions; verify before changing.
            size_t alpha = (size_t) floor(log2((double) n / m)), x = n - pow(2, alpha) + 1;
            if (*(a + m - 1) < *(b + x - 1)) {
                // Last of A is smaller than b[x-1]: exchange the block [b+x-1, buffer)
                // with the buffer, so the buffer now starts at b+x-1 and that tail of B
                // follows it.
                block_rotate(b + x - 1, buffer, buffer_end); //pull out the set of all elements in B >= bx
                buffer = b + x - 1; //new buffer position
                buffer_end = buffer + buff_size;
                n = x - 1;
            } else {
                // Otherwise locate the last element of A inside b[x-1 .. n-1]
                // (binary_search takes an inclusive upper bound).
                Iter place = binary_search(b + x - 1, b + n - 1, a + m - 1);
                if (place != b + n) {
                    // Move the B elements from `place` onwards behind the buffer.
                    block_rotate(place, buffer, buffer_end);  //pull out the set of all elements in B >= am
                    buffer = place;
                    buffer_end = buffer + buff_size;
                }
                // Put the last element of A into the last buffer slot, then shrink
                // the buffer by that slot.
                std::swap(*(a + m - 1), *(buffer_end - 1)); //insert am
                --buff_size;
                buffer_end = buffer + buff_size;
                --m;
                n = place - b;
            }
        } else {
            // A is at least as long as B: same scheme with the roles swapped,
            // probing A at the 1-based index x = m - 2^alpha + 1 against the last
            // element of B.
            size_t alpha = (size_t) floor(log2((double) m / n)), x = m - std::pow(2, alpha) + 1;
            if (*(b + n - 1) < *(a + x - 1)) {
                // Last of B is smaller than a[x-1]: swap a[x-1 .. m-1] into the last
                // m - (x - 1) buffer slots and shrink the buffer accordingly.
                swap_blocks(a + x - 1, buffer_end - m + x - 1,
                            m - (x - 1)); //pull out the set of all elements in A >= ax
                buff_size -= m - (x - 1);
                buffer_end = buffer + buff_size;
                m = x - 1;
            } else {
                // Locate the last element of B inside a[x-1 .. m-1].
                Iter place = binary_search(a + x - 1, a + m - 1, b + n - 1);
                // Number of A elements from `place` to the end of A.
                ptrdiff_t c_size = a + m - place;
                swap_blocks(place, buffer_end - c_size, c_size); //pull out the set of all elements in A >= bn
                buff_size -= c_size;
                buffer_end = buffer + buff_size;
                // Exchange the single element b[n-1] with the remaining buffer, so it
                // ends up directly after the buffer; the buffer therefore starts one
                // slot earlier.
                block_rotate(b + n - 1, buffer, buffer_end); //insert bn
                --buffer;
                buffer_end = buffer + buff_size;
                --n;
                m -= c_size;
            }
        }

    }
    // B is exhausted but A is not: swap the remaining m elements of A into the
    // remaining buffer slots.
    if (m > 0)
        swap_blocks(a, buffer, m);
}

以下是其中使用的函数:

// Exchanges, pair by pair and front to back, the `len` elements starting at
// `pos1` with the `len` elements starting at `pos2`.
// The two ranges may overlap; the pairs are still swapped strictly in
// increasing position order.
template<typename Iter>
inline void swap_blocks(Iter pos1, Iter pos2, size_t len) {
    // Walk both cursors forward together, counting the remaining pairs down.
    for (; len > 0; --len, ++pos1, ++pos2) {
        std::swap(*pos1, *pos2);
    }
}

// Searches the range [be, en] (`en` is the last element, not one-past-the-end)
// for the element `key` points to.
// Returns the first position whose element is not less than *key; when the
// element at `en` is still smaller than *key, returns en + 1.
// The element at the final position is always dereferenced, so the range must
// contain at least one element.
template<typename Iter>
inline Iter binary_search(Iter be, Iter en, Iter key) {
    Iter lo = be;
    Iter hi = en;
    while (lo < hi) {
        const Iter probe = lo + (hi - lo) / 2;
        if (!(*probe < *key)) {
            // probe is a candidate answer: keep it as the inclusive upper bound.
            hi = probe;
            continue;
        }
        // Everything up to and including probe is too small.
        lo = probe + 1;
    }

    const bool exact_hit = (hi == lo) && (*lo == *key);
    if (exact_hit || (*lo > *key)) {
        return lo;
    }
    // Every element examined is smaller than the key: answer is one past `lo`.
    return lo + 1;
}

// Shifts elements through a one-element "hole" at pos1 - 1.
// The element at pos1 - 1 is saved; then, for each of `len` steps, the next
// element of the pos2 block is copied into the hole and the element right
// after the hole is copied into the freed pos2 slot, which moves the hole one
// position forward. Finally the saved element is written into the hole, which
// by then is at pos1 + len - 1.
// Requires pos1 - 1 to be a valid, dereferenceable position.
template<typename Iter>
inline void floating_hole(Iter pos1, Iter pos2, size_t len) {
    auto saved = *(pos1 - 1);
    Iter hole = pos1 - 1;
    Iter donor = pos2;
    for (size_t remaining = len; remaining > 0; --remaining) {
        *hole = *donor;   // fill the hole from the pos2 block
        ++hole;
        *donor = *hole;   // refill the pos2 slot from the element after the hole
        ++donor;
    }
    *hole = saved;
}

// Exchanges the adjacent blocks [first1, first2) and [first2, last) in place,
// so the second block ends up in front of the first, using only block swaps.
// Each round swaps the shorter block into its final position and continues
// with the part that is still out of order; once both remaining blocks have
// equal length a single swap finishes the job.
// Does nothing when either block is empty.
template<typename Iter>
inline void block_rotate(Iter first1, Iter first2, Iter last) {
    ptrdiff_t left_len = first2 - first1;
    ptrdiff_t right_len = last - first2;
    if (left_len == 0 || right_len == 0)
        return;

    // The boundary between the two blocks never moves; only the lengths shrink.
    const Iter boundary = first1 + left_len;
    for (;;) {
        if (left_len == right_len) {
            swap_blocks(boundary - left_len, boundary, left_len);
            return;
        }
        if (left_len > right_len) {
            // Right block is shorter: swap it with the head of the left block.
            swap_blocks(boundary - left_len, boundary, right_len);
            left_len -= right_len;
        } else {
            // Left block is shorter: swap it with the tail of the right block.
            swap_blocks(boundary - left_len, boundary + right_len - left_len, left_len);
            right_len -= left_len;
        }
    }
}

我运行了测试,将我的实现与std::inplace_merge进行比较。我预料我的代码会慢一些,但得到的结果如下:在-O3优化下合并一个含100000个int64_t的向量(两部分各50000个):

Hwang-Lin:5.72054 s

std::inplace_merge:0.0010003 s

std::inplace_merge 快了约5000倍!我是不是遗漏了什么?

0 个答案:

没有答案