Question

考虑使用以下函数复制图像中的行：

/// Duplicates every row of an image vertically.
///
/// Source row `r` is copied to destination rows `2*r` and `2*r + 1`.
///
/// @param image_in  Source buffer; must hold at least `width * height` bytes.
/// @param image_out Destination buffer; must hold at least `2 * width * height`
///                  bytes and must not overlap `image_in` (memcpy requirement).
/// @param width     Size of one row in bytes.
/// @param height    Number of rows in the source image.
///
/// Does nothing when `width` or `height` is not positive.
void DuplicateRows(char* image_in, char *image_out, int width, int height)
{
    if (width <= 0 || height <= 0)
    {
        return;
    }

    // Do the offset arithmetic in size_t so large images cannot overflow int.
    const std::size_t row_bytes = static_cast<std::size_t>(width);
    const std::size_t row_count = static_cast<std::size_t>(height);

    // The original loop incremented an undeclared `i` instead of `row`.
    for (std::size_t row = 0; row < row_count; ++row)
    {
        const char* src = image_in + row * row_bytes;
        char* dst = image_out + 2 * row * row_bytes;

        std::memcpy(dst, src, row_bytes);
        std::memcpy(dst + row_bytes, src, row_bytes);
    }
}

当我尝试将图像分割成多个切片，并将每个切片分配给单独的线程时（例如，第 0–539 行交给线程 1，第 540–1079 行交给线程 2），运行时间反而随着线程数的增加而变长。这有合理的解释吗？（我怀疑瓶颈在于内存访问被串行化了。）

更详细：

我运行的测试如下（其中每行只调用一次 memcpy，而不是两次，但这没有关系，这个例子只是为了说明问题）：

#include <chrono>
#include <condition_variable>
#include <cstring>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

// Dimensions of the test image: `height` rows of `width` bytes each.
const int height = 1080;
const int width = 3840;

// Completion signalling shared between the worker threads and the waiter.
// `finished` counts workers that are done; it is guarded by `mu`, and `cv`
// is notified after each increment.
// Standard-library types are qualified because nothing at file scope pulls
// in namespace std.
std::condition_variable cv;
std::mutex mu;
int finished = 0;
void execute(vector<unsigned char>&vec_in, vector<unsigned char>& vec_out, int factor)
{
    auto src_row_ptr = &vec_in[0];
    auto dst_row_ptr = &vec_out[0];

    for(int i = 0; i<height/factor; i++)
    {
        memcpy(dst_row_ptr, src_row_ptr, width);

        src_row_ptr+= width;
        dst_row_ptr+= width;
    }

    unique_lock<mutex> lock(mu);

    finished++;

    lock.unlock();
    cv.notify_one();
}   


/// Single-threaded baseline: copies a `height` x `width` byte image row by row
/// on the calling thread, prints the elapsed time in microseconds, then waits
/// for a character on standard input.
void check1thread()
{
    using namespace std::chrono;
    finished = 0;
    std::cout << "Checking 1 thread ... \n";
    std::vector<unsigned char> vec1(height * width, 1);
    std::vector<unsigned char> vec1_res(height * width, 0);

    // Only the copy loop is timed; buffer allocation happens before tm0.
    const auto tm0 = high_resolution_clock::now();
    const unsigned char* src_row_ptr = vec1.data();
    unsigned char* dst_row_ptr = vec1_res.data();

    for (int i = 0; i < height; ++i)
    {
        std::memcpy(dst_row_ptr, src_row_ptr, width);

        src_row_ptr += width;
        dst_row_ptr += width;
    }

    const auto tm1 = high_resolution_clock::now();
    std::cout << "work done\n";

    std::cout << duration_cast<microseconds>(tm1 - tm0).count() << " microseconds passed \n";

    // Pause until the user presses a key before continuing.
    std::cin.get();

}

void check2threads()
{
    using namespace std::chrono;
    finished =0;
    cout<<"Checking 2 thread ... \n";
    vector<unsigned char> vec1(height/2 * width, 1);
    vector<unsigned char> vec1_res(height/2 * width ,0);

    vector<unsigned char> vec2(height/2 * width, 1);
    vector<unsigned char> vec2_res(height/2 * width, 0);

    auto tm0 = high_resolution_clock::now();

    thread t1(execute, std::ref(vec1), std::ref(vec1_res) ,2 );
    thread t2(execute, std::ref(vec2), std::ref(vec2_res) ,2 );

    unique_lock<mutex> ul(mu);
    cv.wait(ul, [](){return finished == 2;} );

    auto tm1 = high_resolution_clock::now();
    cout<<"work done\n";

    cout<<duration_cast<microseconds>(tm1-tm0).count() << " microseconds passed \n";


    t1.join();
    t2.join();
}


/// Runs the single-threaded baseline followed by the two-thread variant,
/// then waits for a character on standard input before exiting.
int main()
{
    check1thread();
    check2threads();
    // Keep the console open until the user presses a key.
    std::cin.get();
}

memcpy（）的性能随着线程数的增加而恶化

0 个答案: