Question

我正在尝试执行以下操作：我假设我有一个带有异构处理单元（PU）的系统，包括CPU，GPU和Intel Xeon Phis。 GPU设备也可以具有不同的特征。因此，在这些设备之间拆分工作负载并不像N / num_devices那么简单。

omp_set_num_threads(system->getPUCount());
#pragma omp parallel
{
    unsigned int cpu_thread_id = omp_get_thread_num();
    unsigned int num_cpu_threads = omp_get_num_threads();

每个线程循环迭代，直到到达数据末尾。

    PU pu = listOfPUs[cpu_thread_id];

    //threads are active until all data is processed
    while (finish_0 < N) {
        //the my_start and my_finish are private to a specific device.
        int my_start = 0;
        int my_finish = 0;

我为每个PU设置了一个常量chunk_size，并且我创建了与我有PU一样多的CPU线程，这意味着每个CPU线程控制一个PU。每个线程确定自己的数据块的开始和结束（关键代码部分）

#pragma omp critical (chunkdetermination_0)
{
    start_0 = finish_0;
    finish_0 = start_0 + pu.getChunkSize();

    if(finish_0 > N)
        finish_0 = N;

    my_start = start_0;
    my_finish = finish_0;
}

现在我检查PU的类型，并执行相应的内核。

if(pu.getType() == GPU) {

            int myN = my_finish-my_start;

            CudaSafeCall(cudaSetDevice(pu.getId()));

            unsigned int nbytes_per_kernel = sizeof(double)*myN;

            //memory allocation
            CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
            CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));

            CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
            CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
            //data transfer
            CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
            CudaSafeCall(cudaMemcpy(d_c, c+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));


            //block and grid values
            dim3 gpu_threads(128);
            dim3 gpu_blocks(myN/gpu_threads.x);
            if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;

            //execute kernel
            kernel_0<<<gpu_blocks,gpu_threads>>>( d_a,  d_c, myN);

            //data transfer device to host
            CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));

            //sycnhronize devices
            CudaSafeCall(cudaDeviceSynchronize());

            // //free device memory
            CudaSafeCall(cudaFree(d_a));
            CudaSafeCall(cudaFree(d_c));
        }

当我用一个GPU测试这个代码时，它工作正常。但是，当我用两个GPU测试它时，它不起作用。我也尝试过使用cuda流，但遗憾的是无法管理它。

任何建议我做错了什么，或者我该如何解决这个问题？

以下是完整的示例：

#include <omp.h>
#include <stdio.h>
#include <vector>
#include <iostream>
#include <sys/time.h>
#include <float.h>
#include <limits.h>

using namespace std;

#define CPU 0
#define GPU 1
#define MIC 2

class PU
{
public:
    PU(int puId, int puType)
    {
        id = puId;
        type = puType;
    }

    int getId() {
        return id;
    }

    void setId(int puId) {
        id = puId;
    }

    int getType() {
        return type;
    }

    char * getTypeAsString() {
        if(type == CPU)
            return (char *) "CPU";
        else if (type == GPU)
            return (char *) "GPU";
        else
            return (char *) "MIC";

    }

    void setType(int puType) {
        type = puType;
    }

    int getChunkSize() {
        return chunkSize;
    }

    void setChunkSize(int puChunkSize) {
        chunkSize = puChunkSize;
    }

private:
    int id;
    int type;
    int chunkSize;
};

class System
{
public:
    System() {
        numOfPUs = 0;

        //Adding PU0 of type GPU to the system
        PU * pu0 = new PU(0, GPU);
        pu0->setChunkSize(262144);
        listOfPUs.push_back(*pu0);
        numOfPUs ++;

        //Adding PU1 of type GPU to the system
        PU * pu1 = new PU(1, GPU);
        pu1->setChunkSize(262144);
        listOfPUs.push_back(*pu1);
        numOfPUs ++;

    }

    vector<PU> getPUs() {
        return listOfPUs;
    }

    int getPUCount() {
        return numOfPUs;
    }

private:
    vector<PU> listOfPUs;

    int numOfPUs;
};


#define N   2097152


//********************** CUDA Error checker **********************
#define CUDA_ERROR_CHECK

#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )

inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }
#endif

    return;
}

inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    cudaError err = cudaGetLastError();
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }

    // More careful checking. However, this will affect performance.
    // Comment away if needed.
    err = cudaDeviceSynchronize();
    if( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
                 file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }
#endif

    return;
}
//********************** CUDA Error checker **********************


__global__ void kernel_0(double * a, double * c, int len)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < len)
    {
        c[idx] = a[idx];
    }
}
/*
 * Start of Generated Code
 * This code enables execution on both host CPUs and accelerating devices
 */
void hybrid_function_0 (double *a, double *c)
{
    System * system = new System();

    //device variable declarations
    double *d_a;
    double *d_c;

    //start and finish determine the chunk size of a device
    int start_0     = 0;
    int finish_0    = 0;

    vector<PU> listOfPUs = system->getPUs();

    printf("Num devices = %d\n", system->getPUCount());
    omp_set_num_threads(system->getPUCount());
    //one thread of the CPU controls one GPU device. The rest of CPU threads will be used to process data.
    #pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

        PU pu = listOfPUs[cpu_thread_id];

        //threads are active until all data is processed
        while (finish_0 < N) {
            //the my_start and my_finish are private to a specific device.
            int my_start = 0;
            int my_finish = 0;

            //the determination of chunks should be performed sequentially, in order to avoid two or more devices processing the same data.
            #pragma omp critical (chunkdetermination_0)
            {
                start_0 = finish_0;
                finish_0 = start_0 + pu.getChunkSize();

                if(finish_0 > N)
                    finish_0 = N;

                my_start = start_0;
                my_finish = finish_0;
            }

            //devices with id less than nDevices are GPU devices. The host CPU has id = nDevices
            if(pu.getType() == GPU) {

                int myN = my_finish-my_start;

                printf("device_id\t%d\tpu_id\t%d\ttype\t%s\tprocessing\t%d-%d (%lu KB)\n", cpu_thread_id, pu.getId(), pu.getTypeAsString(), my_start, my_finish, sizeof(double)*myN/1000);
                CudaSafeCall(cudaSetDevice(pu.getId()));

                unsigned int nbytes_per_kernel = sizeof(double)*myN;

                //memory allocation
                CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
                CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));

                CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
                CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
                //data transfer
                CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
                CudaSafeCall(cudaMemcpy(d_c, c+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));


                //block and grid values
                dim3 gpu_threads(128);
                dim3 gpu_blocks(myN/gpu_threads.x);
                if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;

                //execute kernel
                kernel_0<<<gpu_blocks,gpu_threads>>>( d_a,  d_c, myN);

                //data transfer device to host
                CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));

                //sycnhronize devices
                CudaSafeCall(cudaDeviceSynchronize());

                // //free device memory
                CudaSafeCall(cudaFree(d_a));
                CudaSafeCall(cudaFree(d_c));
            }
            //execute on host
            else if (pu.getType() == CPU) {
                omp_set_num_threads(omp_get_max_threads());
                #pragma omp parallel for
                for (int  i = my_start; i < my_finish; i++)
                {
                    c[i] = a[i];
                }
            }
            //execute on MIC
            else if (pu.getType() == MIC) {
                #pragma offload target(mic: cpu_thread_id) in(a[my_start:my_finish]) in(c[my_start:my_finish])  out(c[my_start:my_finish])
                {
                    #pragma omp parallel for
                    for (int  i = my_start; i < my_finish; i++)
                    {
                        c[i] = a[i];
                    }
                }
            }
        }
    }
}
/*
 * End of Generated Code
 */

int main()
{

    double *a, *b, *c;
    double scalar;

    /* Allocate memory on host */
    a = (double*)malloc(sizeof(double)*N);
    b = (double*)malloc(sizeof(double)*N);
    c = (double*)malloc(sizeof(double)*N);

    // omp_set_num_threads(omp_get_max_threads());

    printf("OMP Max threads %d\n", omp_get_max_threads());
    #pragma omp parallel
    {
        #pragma omp master
        printf("OMP Num threads %d\n", omp_get_num_threads());
    }

    // #pragma omp parallel for
    // for(int i = 0; i < 10; i++) {
    //     printf("I am thread %d\n", omp_get_thread_num());
    // }
    //initialization of variables
    #pragma omp parallel for
    for (int j=0; j<N; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }

    #pragma omp parallel for
    for (int j = 0; j < N; j++)
        a[j] = 2.0E0 * a[j];

    scalar=3.0f;

         printf("%s\n", "COPY Started");
         hybrid_function_0(a, c);
         printf("%s\n", "COPY Finished");

    return 0;
}

你可以用：

编译它

nvcc mini.cu -o mini -Xcompiler "-fopenmp"

当分配两个或多个GPU来运行此类时，我会收到不同的错误消息，有时它会挂起并且什么都不做。当我通过nvidia-smi检查GPU状态时，它显示GPU正在执行此类，但利用率为0％。

有些错误包括：

cudaSafeCall() failed at mini.cu:221 : invalid argument
cudaSafeCall() failed at mini.cu:221 : driver shutting down

Answer 1

我想应该将hybrid_function_0中的设备变量声明移到主OpenMP并行部分中，如下所示：

#pragma omp parallel
{
    unsigned int cpu_thread_id = omp_get_thread_num();
    unsigned int num_cpu_threads = omp_get_num_threads();

    //device variable declarations
    double *d_a;
    double *d_c;
    ...

就目前而言，线程之间存在内存争用，这可能导致设备变量指针值被其他线程从错误设备分配的值所覆盖，从而导致您在API调用中报告各种无效参数错误，这些错误使用设备指针作为参数。

我希望该错误可能在任何cudaMemset，cudaMemcpy或cudaFree调用中发生，或者该函数中的内核启动，并且该错误可能会在不同地方发生取决于竞争线程的行为。

CUDA + OpenMP多GPU批处理

1 个答案: