考虑使用以下函数复制图像中的行:
/// Writes every row of `image_in` twice, back to back, into `image_out`.
///
/// @param image_in  Source buffer, read as `height` rows of `width` bytes.
/// @param image_out Destination buffer; receives `2 * height` rows of `width` bytes.
/// @param width     Bytes per row.
/// @param height    Number of source rows.
///
/// Does nothing when `width` or `height` is not positive.
void DuplicateRows(char* image_in, char *image_out, int width, int height)
{
    if (width <= 0 || height <= 0)
        return;

    // Offsets are computed in size_t so large images cannot overflow int arithmetic.
    const std::size_t row_bytes = static_cast<std::size_t>(width);
    for (int row = 0; row < height; ++row)
    {
        const char* src = image_in + static_cast<std::size_t>(row) * row_bytes;
        char* dst = image_out + 2 * static_cast<std::size_t>(row) * row_bytes;
        std::memcpy(dst, src, row_bytes);              // output row 2*row
        std::memcpy(dst + row_bytes, src, row_bytes);  // output row 2*row + 1
    }
}
当我尝试将图像分割成多个切片,并将每个切片分配给单独的线程时(例如,第0-539行交给线程1,第540-1079行交给线程2),运行时间反而随着线程数的增加而变长。这有什么解释吗?(我怀疑瓶颈在于内存访问被串行化了)
更详细:
我运行的测试如下(其中每行没有做两次memcpy,但这并不重要,这个例子只是为了说明问题):
#include <chrono>
#include <condition_variable>
#include <cstring>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>
// Image dimensions used by the benchmarks: number of rows, and bytes per row.
constexpr int height = 1080;
constexpr int width = 3840;

// Completion signalling between the worker threads and the waiting thread.
std::condition_variable cv;  // notified by each worker after it bumps `finished`
std::mutex mu;               // guards `finished`
int finished;                // number of workers that have completed their copy
void execute(vector<unsigned char>&vec_in, vector<unsigned char>& vec_out, int factor)
{
auto src_row_ptr = &vec_in[0];
auto dst_row_ptr = &vec_out[0];
for(int i = 0; i<height/factor; i++)
{
memcpy(dst_row_ptr, src_row_ptr, width);
src_row_ptr+= width;
dst_row_ptr+= width;
}
unique_lock<mutex> lock(mu);
finished++;
lock.unlock();
cv.notify_one();
}
/// Times a single-threaded, row-by-row copy of a `height` x `width` byte
/// buffer, prints the elapsed microseconds, then waits for a character on stdin.
void check1thread()
{
    using namespace std::chrono;
    finished = 0;
    std::cout<<"Checking 1 thread ... \n";
    std::vector<unsigned char> vec1(height * width, 1);
    std::vector<unsigned char> vec1_res(height * width ,0);

    // steady_clock is monotonic, so the interval cannot be distorted by
    // system clock adjustments during the measurement.
    auto tm0 = steady_clock::now();
    auto src_row_ptr = vec1.data();
    auto dst_row_ptr = vec1_res.data();
    for (int i = 0; i < height; ++i)
    {
        std::memcpy(dst_row_ptr, src_row_ptr, width);
        src_row_ptr += width;
        dst_row_ptr += width;
    }
    auto tm1 = steady_clock::now();

    std::cout<<"work done\n";
    std::cout<<duration_cast<microseconds>(tm1-tm0).count() << " microseconds passed \n";
    std::cin.get();
}
void check2threads()
{
using namespace std::chrono;
finished =0;
cout<<"Checking 2 thread ... \n";
vector<unsigned char> vec1(height/2 * width, 1);
vector<unsigned char> vec1_res(height/2 * width ,0);
vector<unsigned char> vec2(height/2 * width, 1);
vector<unsigned char> vec2_res(height/2 * width, 0);
auto tm0 = high_resolution_clock::now();
thread t1(execute, std::ref(vec1), std::ref(vec1_res) ,2 );
thread t2(execute, std::ref(vec2), std::ref(vec2_res) ,2 );
unique_lock<mutex> ul(mu);
cv.wait(ul, [](){return finished == 2;} );
auto tm1 = high_resolution_clock::now();
cout<<"work done\n";
cout<<duration_cast<microseconds>(tm1-tm0).count() << " microseconds passed \n";
t1.join();
t2.join();
}
/// Runs the single-threaded benchmark, then the two-threaded one, and waits
/// for a character on stdin before exiting.
int main()
{
    check1thread();
    check2threads();
    std::cin.get();
    return 0;
}