我正在编写代码,其中一个线程生成整数,另一个线程使用它们:
#include <mutex>
#include <condition_variable>
#include <thread>
namespace custom {
/// Bounded blocking FIFO queue for passing values between threads.
///
/// Storage is a ring of N+1 slots. One slot is always left empty so that
/// `push_ptr == pop_ptr` unambiguously means "empty"; at most N elements are
/// buffered at any time. All state is accessed under `lk`, so push() and
/// pop() may be called concurrently.
///
/// @tparam T  Element type; must be default-constructible and copyable.
/// @tparam N  Maximum number of buffered elements (must be > 0).
template<class T, unsigned N>
class factory {
    static_assert(N > 0, "factory needs room for at least one element");

    T qu[N + 1];            // ring storage, one spare slot
    unsigned push_ptr = 0;  // index of the next slot to write
    unsigned pop_ptr = 0;   // index of the next slot to read
    std::mutex lk;          // guards qu, push_ptr and pop_ptr
    std::condition_variable cv_push, cv_pop;

    /// Number of elements currently buffered. Caller must hold `lk`.
    ///
    /// This is (push_ptr - pop_ptr) modulo the ring size. Computing the
    /// difference the other way round reports N as soon as a single element
    /// is stored, which makes every push wait for the matching pop.
    unsigned count() const {
        return push_ptr >= pop_ptr ? push_ptr - pop_ptr
                                   : push_ptr + (N + 1) - pop_ptr;
    }
public:
    /// Appends `x`, blocking while the queue already holds N elements.
    void push(T x) {
        std::unique_lock<std::mutex> ulk(lk);
        cv_push.wait(ulk, [this] { return count() < N; });
        qu[push_ptr] = x;
        push_ptr = (push_ptr == N) ? 0 : push_ptr + 1;
        // Release the mutex before notifying so the woken thread can take it
        // immediately.
        ulk.unlock();
        cv_pop.notify_one();
    }

    /// Removes and returns the oldest element, blocking while the queue is
    /// empty.
    T pop() {
        std::unique_lock<std::mutex> ulk(lk);
        cv_pop.wait(ulk, [this] { return count() > 0; });
        T ret = qu[pop_ptr];
        pop_ptr = (pop_ptr == N) ? 0 : pop_ptr + 1;
        ulk.unlock();
        cv_push.notify_one();
        return ret;
    }
};
}
// Queue shared by the producer loop in main and the consumer in fun1.
custom::factory<int, 64> factory_buffer;
// Number of integers sent through the queue.
const int N = 1000000;
// Running sum of everything the consumer pops. 0 + 1 + ... + (N-1) is about
// 5e11, which overflows a 32-bit int (undefined behaviour for a signed type),
// so the sum is accumulated in 64 bits.
long long S = 0;
// Consumer thread body: pops N values from factory_buffer and adds each to S.
void fun1() {
    int remaining = N;
    while (remaining-- > 0) {
        S += factory_buffer.pop();
    }
}
int main() {
std::thread th1(fun1);
for (int i=0; i<N; ++i) factory_buffer.push(i);
th1.join();
}
运行结果表明,每对 push/pop 操作耗时约 6.6 µs,CPU 占用略低于 1 个核心。然而,使用 MPI 的版本似乎表现更好:
#include <stdio.h>
#include <stdlib.h>
#include <thread>
#include <mutex>
// lock_range(m) <statement>: runs <statement> while holding m. The `if`
// condition declares a custom::__autolock, whose constructor calls m.lock()
// and whose destructor calls m.unlock() when the `if` statement ends; the
// guard converts to true, so <statement> always executes.
// NOTE(review): an `else` written after the statement would bind to this
// hidden `if`.
#define lock_range(...) if(custom::__autolock lock_range_temp=(__VA_ARGS__))
namespace custom {
/// Scoped lock guard: calls lock() on the target when constructed and
/// unlock() when destroyed. (std::lock_guard / std::unique_lock provide the
/// same behaviour for standard mutexes.)
///
/// NOTE(review): identifiers containing "__" are reserved for the
/// implementation; the name is kept because the lock_range macro refers to it.
///
/// @tparam T  Any type with lock() and unlock() member functions.
template<class T>
struct __autolock {
    T& lock;  // the guarded object; must outlive this guard

    // Intentionally not `explicit`: lock_range copy-initializes the guard
    // inside an `if` condition.
    __autolock(T& target) : lock(target) {
        lock.lock();
    }
    ~__autolock() {
        lock.unlock();
    }

    // A copied guard would unlock the same object a second time.
    __autolock(const __autolock&) = delete;
    __autolock& operator=(const __autolock&) = delete;

    /// Always true, so a guard can be declared as an `if` condition.
    explicit operator bool() const noexcept { return true; }
};
}
#include <atomic>
#include <time.h>
#include <mpi.h>
#include <string.h>
// Debug switch; not referenced by any code visible in this file.
#define debugflag 0
// Mutex held (through lock_range) around every MPI call made by the worker
// threads in main, so that MPI calls never run concurrently within a process.
std::mutex mu;
/// Two-process MPI throughput benchmark.
///
/// Rank 1 starts two threads that each send N ints to rank 0; rank 0 starts
/// two threads that each receive N ints from rank 1 and add them into a shared
/// sum. Every MPI call is made while holding `mu`. Each rank finally prints
/// its clock() reading.
///
/// @return 0 on success, 1 if not started with exactly two processes.
int main (int argc, char *argv[]) {
    using std::thread;
    int myrank, nprocs;
    const int N = 100000000;
    // 93114ms for N=1e8
    {
        int provided;
        MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
        if (provided != MPI_THREAD_MULTIPLE) { // The author's environment only supports MPI_THREAD_SERIALIZED
            printf("MPI do not Support Multiple thread: %d\n", provided);
            //exit(0);
        }
    }
    //MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    // Rank 0 only ever receives from rank 1, so any other process count would
    // either hang (1 process) or leave sends unmatched (more than 2).
    if (nprocs != 2) {
        if (myrank == 0) {
            fprintf(stderr, "This benchmark needs exactly 2 MPI processes, got %d\n", nprocs);
        }
        MPI_Finalize();
        return 1;
    }

    if (myrank) {
        // Sender: each thread sends the values 0..N-1 to rank 0.
        const auto f = []() {
            for (int i = 0; i < N; ++i) {
                lock_range(mu) {
                    // The buffer is a plain int, so describe it as MPI_INT.
                    MPI_Send(&i, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
                }
            }
        };
        thread t1(f), t2(f);
        t1.join();
        t2.join();
    } else {
        unsigned sum = 0;
        // Receiver: each thread receives N values; `sum` is updated under the
        // same lock that serializes the MPI call.
        const auto f = [&sum]() {
            int res;
            for (int i = 0; i < N; ++i) {
                lock_range(mu) {
                    // MPI_Recv takes a single status, so the "ignore" constant
                    // is MPI_STATUS_IGNORE (MPI_STATUSES_IGNORE is for arrays).
                    MPI_Recv(&res, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    sum += res;
                }
            }
        };
        thread t1(f), t2(f);
        t1.join();
        t2.join();
        printf("sum = %u\n", sum);
    }
    // clock_t is not int; cast explicitly so the argument matches the format.
    printf("Timer #%d: %ld\n", myrank, static_cast<long>(clock()));
    MPI_Finalize();
    return 0;
}
(同样是线程安全的)每次发送和接收花费 465 ns;若只用一个线程发送、一个线程接收且 N = 1e6,则花费 289 ns,速度是单进程方案的 10 倍以上。
我应该如何提高单进程解决方案的性能,使其至少赶上多进程解决方案?