Question

我想使用CUDA中的Page-locked Host memory在主机和设备之间共享消息，让我通过以下示例表达我的想法。我不确定这是否合理。

我的机器的环境：

 - Ubuntu 14.04.5 LTS
 - gcc (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
 - CUDA 9.1

我将程序分为四个步骤，如下所示：


我们假设有两个块，并且对于第一个块，它进行了一些计算，并且在第一个块的末尾生成了一个信号   块;

当第一个块完成功能时，它通知CPU终端，然后在CPU中组织相应的数据；

然后，将数据复制到gpu，并在数据复制完成后向gpu发出信号；

根据步骤3中的信号触发gpu中的第二个块。

了解了我想做的事后，我遇到了一个问题，当我更改页锁内存中的数据（在我的程序中是信号）时，似乎无法通过对面的设备。

对于这个问题，我尝试了以下方法

我发现CUDA编译器可以优化该值并将其存储在寄存器中，因此我无法在内核中获取最新的值，因此我注意到PTX。
我试图利用PTX来防止编译器优化部分代码，但我成功地在内核中获取了信号，但未能将信号形式的设备传递给主机让我很困惑。

我的项目的部分代码如下所示：

__global__ void pipeline(int *flag_a, int*flag_b, int*Input, int*Out){
    int idx = threadIdx.x;
    if (blockIdx.x == 0) {
        if (0 == idx) {
            flag_a[0] = 1;    //to generate signal in the step one 
                              //why the host cannot get the flag_a[0]==1?
        }
    }

    if (blockIdx.x == 1) {
        if (0 == idx) {
            int value = 0;
            do{
                asm volatile("ld.global.cg.u32 %0, [%1];" :"=r"(value) : "l"(&flag_b[0]));
                //receipt signal form the host generate in step 3
                //and the asm volatile to make sure I can get the newest flag_b[0]
            } while (value != 1);
        }
        __syncthreads();
        Out[idx] = Input[idx] + idx;
    }
}

int main()
{
    /*1*/
    int *flag_a, *flag_b;
    cudaHostAlloc((void**)&flag_a, sizeof(int), cudaHostAllocMapped);
    cudaHostAlloc((void**)&flag_b, sizeof(int), cudaHostAllocMapped);
    flag_a[0] = 0;
    flag_b[0] = 0;
    /*2*/
    int*Input, *Out;
    int *d_Input, *d_Out;
    int*d_float_a, *d_float_b;
    Input = (int*)malloc(sizeof(int) * 32);
    Out = (int*)malloc(sizeof(int) * 32);
    for (int i = 0; i<32; i++) {
        Input[i] = i;
    }
    memset(Out, 0, sizeof(int) * 32);

    cudaMalloc((void**)&d_Input, sizeof(int) * 32);
    cudaMemset(d_Input, 0, sizeof(int) * 32);
    cudaMalloc((void**)&d_Out, sizeof(int) * 32);
    cudaMemset(d_Out, 0, sizeof(int) * 32);

    cudaHostGetDevicePointer((void **)&d_float_a, (void *)flag_a, 0);
    cudaHostGetDevicePointer((void **)&d_float_b, (void *)flag_b, 0);

    cudaStream_t stream_kernel, stream_datacopy;
    cudaStreamCreate(&stream_kernel);
    cudaStreamCreate(&stream_datacopy);

    pipeline <<< 2, 32, 0, stream_kernel >>> (d_float_a, d_float_b, d_Input, d_Out);
    int count = 0;
    do{
        if (flag_a[0]==1){
            cudaMemcpyAsync(d_Input, Input, sizeof(int) * 32, cudaMemcpyHostToDevice, stream_datacopy);
            cudaStreamSynchronize(stream_datacopy);
            flag_b[0] = 1;  //step 3；
            break;
        }
        if (count==10)
            break;
    } while (1 != flag_a[0]);

    cudaStreamSynchronize(stream_kernel);
    cudaMemcpy(Out, d_Out, sizeof(int) * 32, cudaMemcpyDeviceToHost);
    for (int i = 0; i<32; i++) {
        printf("%d:%d\n", i, Out[i]);
    }
    // free()
    return 0;
}

我不是很擅长CUDA编程，我不确定这是否是在主机和设备之间切换信号的正确方法，我所做的只是尝试，如果有人可以给我建议，我将不胜感激，请先感谢：）

Answer 1

最后，我删除了PTX部分的代码，并将代码放入Tesla P100-PCIE（TCC mode）中，可以正确运行我期望的程序。感谢RobertCrovella在评论中给出的提示。

这是更新的代码和结果。

__global__ void pipeline(volatile float *flag_a, volatile float*flag_b, int*Input, int*Out)
{
    int idx = threadIdx.x;
    if (blockIdx.x == 0) {
        if (0 == idx) {
            flag_a[idx] = 1;    
        }
    }

    if (blockIdx.x == 1) {
        if (0 == idx) {
            while (!(1 == flag_b[0])) {
            }
        }
        __syncthreads();
        Out[idx] = Input[idx] + idx;
    }
}

在主要功能上可以从内核获取信号。

int main()
{
    //Data definition
    pipeline << < 2, 32, 0, stream_kernel >> > (flag_a, flag_b, d_Input, d_Out);
    while (flag_a[0] == 0);
    if (flag_a[0] == 1)
    {
        std::cout << "get the flag_a[0]==1" << std::endl;
        cudaMemcpyAsync(d_Input, Input, sizeof(int) * 32, cudaMemcpyHostToDevice, stream_datacopy);
        cudaStreamSynchronize(stream_datacopy);
        flag_b[0] = 1;
        std::cout << "data transfer has finished" << std::endl;
    }

    cudaStreamSynchronize(stream_kernel);
    cudaMemcpy(Out, d_Out, sizeof(int) * 32, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; i++) 
    {
        printf("%d:%d\n", i, Out[i]);
    }
    //free the memory;
    return 0;
}

这里是result。

如何通过页面锁定主机内存共享变量

1 个答案: