Question

我正在编写代码，其中一个线程生成整数，另一个线程使用它们：

#include <mutex>
#include <condition_variable>
#include <thread>
namespace custom {
    /// Bounded blocking FIFO queue shared between producer and consumer threads.
    ///
    /// Storage is a ring of N+1 slots of which at most N are occupied at any
    /// time; keeping one slot free lets `push_ptr == pop_ptr` mean "empty"
    /// without ambiguity. All state is guarded by one mutex: `push` blocks
    /// while the queue holds N elements and `pop` blocks while it is empty.
    ///
    /// @tparam T element type; must be default-constructible and copy-assignable.
    /// @tparam N maximum number of elements held at once.
    template<class T, unsigned N>
    class factory {
        T qu[N+1];
        // Explicitly zeroed so that non-static instances also start out empty.
        unsigned push_ptr = 0;  // index of the next slot to write
        unsigned pop_ptr = 0;   // index of the next slot to read
        std::mutex lk;
        std::condition_variable cv_push, cv_pop;
        /// Number of elements currently stored. The caller must hold `lk`.
        inline unsigned count() const {
            // Distance from the read index forward to the write index, modulo
            // the ring size. Measuring it in the opposite direction reports a
            // single queued element as N, which makes push() see the queue as
            // full after one item and turns every push/pop into a thread
            // handoff.
            return push_ptr>=pop_ptr ? push_ptr-pop_ptr : push_ptr+(N+1)-pop_ptr;
        }
    public:
        /// Appends `x`, blocking while the queue already holds N elements.
        void push(T x) {
            std::unique_lock ulk(lk);
            cv_push.wait(ulk, [this](){return count()<N;});
            qu[push_ptr++] = x;
            if (push_ptr>N) push_ptr=0;
            // Release the mutex before notifying so the woken consumer does
            // not immediately block on it.
            ulk.unlock();
            cv_pop.notify_one();
        }
        /// Removes and returns the oldest element, blocking while the queue is empty.
        T pop() {
            std::unique_lock ulk(lk);
            cv_pop.wait(ulk, [this](){return count()>0;});
            T ret = qu[pop_ptr++];
            if (pop_ptr>N) pop_ptr=0;
            ulk.unlock();
            cv_push.notify_one();
            return ret;
        }
    };
}
// Shared bounded queue connecting the producer (main) to the consumer (fun1).
custom::factory<int, 64> factory_buffer;
// Number of integers passed through the queue.
const int N = 1000000;
// Running total of the consumed values. 0 + 1 + ... + (N-1) is roughly 5e11,
// which does not fit in an int where int is 32 bits, so a 64-bit accumulator
// is used to avoid signed overflow.
long long S=0;
/// Consumer: pops N values from factory_buffer and adds each one to S.
void fun1() {
    for (int i=0; i<N; ++i) S+=factory_buffer.pop();
}
/// Producer: starts the consumer thread, pushes 0..N-1, then joins the consumer.
int main() {
    std::thread th1(fun1);
    for (int i=0; i<N; ++i) factory_buffer.push(i);
    th1.join();
}

运行结果表明，每对 push 和 pop 操作的开销约为 6.6 µs，CPU 占用略低于 1 个核心。然而，使用 MPI 的版本似乎表现更好：

#include <stdio.h>
#include <stdlib.h>
#include <thread>
#include <mutex>
// lock_range(m) <statement>
// Runs <statement> while holding lock `m`. The macro expands to an `if` whose
// condition declares a custom::__autolock guard constructed from `m`; the
// guard converts to true, so the statement always executes, and the guard's
// destructor unlocks `m` when the `if` statement ends.
// Caveat: because this expands to an `if`, an `else` written directly after
// the guarded statement would attach to it.
#define lock_range(...) if(custom::__autolock lock_range_temp=(__VA_ARGS__))
namespace custom { // Didn't know unique_lock does this thing
    /// Scoped lock guard: calls `lock()` on the wrapped object on construction
    /// and `unlock()` on destruction.
    ///
    /// The converting constructor is intentionally non-explicit and the guard
    /// is contextually convertible to `true`, so it can be declared inside an
    /// `if` condition (which is how the lock_range macro uses it).
    ///
    /// NOTE(review): identifiers containing a double underscore are reserved
    /// for the implementation; the name is kept because lock_range refers to it.
    ///
    /// @tparam T any type providing `lock()` and `unlock()` member functions.
    template<class T>
    struct __autolock {
        T& lock;
        __autolock(T& lock): lock(lock) {
            lock.lock();
        }
        ~__autolock() {
            lock.unlock();
        }
        // A copy would call unlock() a second time on the same object.
        __autolock(const __autolock&) = delete;
        __autolock& operator=(const __autolock&) = delete;
        // Always true; explicit so the guard cannot silently take part in
        // arithmetic or be assigned to an integer.
        explicit operator bool() const {return true;}
    };
}
#include <atomic>
#include <time.h>
#include <mpi.h>
#include <string.h>
// Debug switch; not referenced anywhere in the visible code.
#define debugflag 0
// Global mutex held (via lock_range) around every MPI_Send / MPI_Recv call in
// main(), so only one thread of this process is inside such a call at a time.
std::mutex mu;
/// MPI message-passing benchmark.
///
/// Every rank other than 0 starts two threads that each send N integers to
/// rank 0. Rank 0 starts two threads that each receive N integers from rank 1,
/// accumulate them into `sum`, and print the total. Each MPI_Send / MPI_Recv
/// is issued while holding `mu`. Every rank finally prints its clock() value.
int main (int argc, char *argv[]) {
    using std::thread;
    int myrank,nprocs;
    const int N = 100000000;
    // 93114ms for N=1e8
    {
        int provided;
        MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE, &provided);
        if(provided != MPI_THREAD_MULTIPLE)      { // My env only supports MPI_THREAD_SERIALIZED
            printf("MPI do not Support Multiple thread: %d\n", provided);
            // Deliberately not fatal: execution continues with the provided level.
            //exit(0);
        }
    }

    //MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank (MPI_COMM_WORLD, &myrank);


    if (myrank) {
        // Sender: each thread sends the values 0..N-1 to rank 0, one message per value.
        const auto f=[](){for (int i=0; i<N; ++i) lock_range(mu) MPI_Send(&i, 1, MPI_INT32_T, 0, 0, MPI_COMM_WORLD);};
        thread t1(f), t2(f);
        t1.join(), t2.join();
    } else {
        unsigned sum=0;
        // Receiver: the receive and the update of `sum` form one statement under
        // lock_range, so both happen while `mu` is held. MPI_STATUS_IGNORE is the
        // single-status constant that MPI_Recv expects (MPI_STATUSES_IGNORE is
        // meant for calls that take an array of statuses).
        const auto f=[&sum](){int res; for (int i=0; i<N; ++i) lock_range(mu) MPI_Recv(&res, 1, MPI_INT32_T, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE), sum+=res;};
        thread t1(f), t2(f);
        t1.join(), t2.join();
        printf("sum = %u\n", sum);
    }

    // clock() returns clock_t, not int; convert explicitly and print with a
    // matching conversion specifier.
    printf ("Timer #%d: %ld\n", myrank, static_cast<long>(clock()));
    MPI_Finalize();
    return 0;
}

（同样是线程安全的）每次发送和接收花费 465 ns；如果只用一个线程发送、一个线程接收，并取 N = 1e6，则每次花费 289 ns，比单进程方案快 10 倍以上。

我应该如何提高单进程解决方案的性能，使其至少赶上多进程解决方案？

工厂多线程队列

0 个答案: