Question

大家好，

我参考了CUDA SDK 4.0中的simpleMultiCopy.cu，自己写了一个类似的程序，请参阅下面的代码。

simpleMultiCopy.cu是一个在循环中让各项操作重叠执行的示例。我的程序与之类似：循环的每次迭代都会向GPU发送一片数据进行计算，并让数据拷贝与计算重叠执行。

这只是一个测试/演示，不必关心内核（increment_kernel）的逻辑，它只是用来延迟一段时间。主要逻辑在processWithStreams函数中。但是这个程序运行不正确，输出如下：

i: 0, current_stream: 0, next_stream: 1
i: 1, current_stream: 1, next_stream: 0
Cuda error in file 'ttt.cu' in line 132 : unspecified launch failure.

第132行是：

CUDA_SAFE_CALL( cudaMemcpyAsync(
            d_data_in[next_stream], 
            h_data_in[next_stream], 
            memsize, 
            cudaMemcpyHostToDevice, 
            stream[next_stream]) ); //this is line 132

我对CUDA的工作方式了解不多，所以请大家帮帮忙。

任何帮助都将不胜感激。

<hr/>

代码：

#include <stdio.h>
#include <cutil_inline.h>

// Runs the streamed copy/compute loop and returns the elapsed time between
// the `start` and `stop` events (defined after main()).
float processWithStreams(int streams_used);
// Number of streams, and of per-stream buffers/events declared below.
#define STREAM_COUNT    2

// Total element count (1 << 24 = 16777216). main() uses it to size
// h_data_source and to derive grid.x; processWithStreams() derives its
// iteration count from it.
int N = 1 << 24;

// Host buffer of N ints, allocated and zeroed in main().
int *h_data_source;
// NOTE(review): never assigned anywhere in the visible code; main() only
// passes it to free().
int *h_data_sink;

// Per-stream input buffers, memsize bytes each: host side comes from
// cudaHostAlloc, device side from cudaMalloc (see main()).
int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];

// Per-stream output buffers, allocated the same way as the input buffers.
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

// cycleDone[i] is recorded on stream[i]; processWithStreams() waits on it
// before reusing that stream's buffers.
cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];

// Events bracketing the timed region in processWithStreams().
cudaEvent_t start, stop;

// Launch configuration: 512 threads per block; grid.x is filled in by main().
dim3 block(512);
dim3 grid;

// Size in bytes of each per-stream buffer; set in main() to
// 1024 * 1024 * sizeof(int).
int memsize;

// Dummy workload kernel: its only purpose is to keep the GPU busy for a while.
//
// Each thread computes its flat index `idx`, then walks `i` from the total
// number of launched threads (blockDim.x * gridDim.x) down to 1 by repeated
// halving, incrementing g_data[idx] every time idx > i.
//
// g_data    - device buffer indexed by the flat thread index.
// inc_value - unused; only the commented-out line below refers to it.
//
// NOTE: there is no bounds check and the kernel receives no element count,
// so g_data must hold at least blockDim.x * gridDim.x ints; otherwise any
// thread with idx beyond the buffer that satisfies idx > i accesses memory
// out of bounds.
__global__ void increment_kernel(int *g_data, int inc_value)
{ 
   // Flat global thread index for a 1D launch.
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   //g_data[idx] = g_data[idx] + inc_value;

   // Start at the total thread count; idx is always below this value, so the
   // first pass never increments.
   int i = blockDim.x * gridDim.x;
   for(; i > 0; i /= 2)
   {
        if(idx > i)
            g_data[idx]++;
   }
}


/*
 * Entry point: selects a device, allocates STREAM_COUNT sets of pinned host
 * and device buffers plus one stream and one event per set, runs
 * processWithStreams() once, prints the measured time and releases every
 * resource that was allocated here.
 *
 * Returns 0 on success, 1 if the host source buffer cannot be allocated.
 * CUDA_SAFE_CALL handles failing CUDA runtime calls.
 */
int main(int argc, char *argv[])
{
    // Honour a "device" command-line flag if present; otherwise use the
    // device id returned by cutGetMaxGflopsDeviceId().
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        CUDA_SAFE_CALL( cudaSetDevice( cutGetMaxGflopsDeviceId()) );

    h_data_source = (int *)malloc(sizeof(int) * N);
    if( h_data_source == NULL )
    {
        // Without this check the memset below would write through NULL.
        fprintf(stderr, "failed to allocate %lu bytes of host memory\n",
                (unsigned long)(sizeof(int) * N));
        return 1;
    }
    memset(h_data_source, 0, sizeof(int) * N);

    int i;
    // Every per-stream buffer holds 1024 * 1024 ints.
    memsize = 1024 * 1024 * sizeof(int);
    for(i = 0; i < STREAM_COUNT; i++)
    {
        // Pinned host memory, required for cudaMemcpyAsync to be asynchronous.
        CUDA_SAFE_CALL( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) );
        CUDA_SAFE_CALL( cudaMalloc(&d_data_in[i], memsize) );

        CUDA_SAFE_CALL( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) );
        CUDA_SAFE_CALL( cudaMalloc(&d_data_out[i], memsize) );

        CUDA_SAFE_CALL( cudaStreamCreate(&stream[i]) );
        CUDA_SAFE_CALL( cudaEventCreate(&cycleDone[i]) );

        // Record once up front so the first cudaEventSynchronize() on this
        // event in processWithStreams() waits on a recorded event.
        CUDA_SAFE_CALL( cudaEventRecord(cycleDone[i], stream[i]) );
    }

    CUDA_SAFE_CALL( cudaEventCreate(&start) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop) );

    // NOTE: N / block.x = 16777216 / 512 = 32768 blocks, i.e. N threads per
    // launch, while each d_data_in[] buffer only holds 1024 * 1024 ints.
    // increment_kernel has no bounds check, so this configuration indexes
    // past the end of the buffer. It is kept exactly as posted because it is
    // the failure this listing is being asked about; sizing the grid from
    // memsize / sizeof(int) instead keeps every thread in bounds.
    grid.x = N / block.x;
    grid.y = 1;

    float time1 = processWithStreams(STREAM_COUNT);
    printf("time: %f\n", time1);

    free( h_data_source );
    // h_data_sink is never allocated in this listing; as a zero-initialised
    // global it is NULL here, so this free() is a no-op.
    free( h_data_sink );

    for( i = 0; i < STREAM_COUNT; ++i ) {

        CUDA_SAFE_CALL( cudaFreeHost(h_data_in[i]) );
        CUDA_SAFE_CALL( cudaFree(d_data_in[i]) );

        // The output buffers are allocated alongside the input buffers above
        // and must be released as well.
        CUDA_SAFE_CALL( cudaFreeHost(h_data_out[i]) );
        CUDA_SAFE_CALL( cudaFree(d_data_out[i]) );

        CUDA_SAFE_CALL( cudaStreamDestroy(stream[i]) );
        CUDA_SAFE_CALL( cudaEventDestroy(cycleDone[i]) );
    }

    CUDA_SAFE_CALL( cudaEventDestroy(start) );
    CUDA_SAFE_CALL( cudaEventDestroy(stop) );

    cudaThreadExit();
    cutilExit(argc, argv);

    return 0;
}

/*
 * Runs N / 1024 / 1024 iterations of a pipelined loop that, in each
 * iteration, launches increment_kernel on the current stream, uploads the
 * next stream's input buffer and downloads the current stream's output
 * buffer, alternating between `streams_used` streams.
 *
 * streams_used - number of streams to rotate through; the per-stream arrays
 *                have STREAM_COUNT entries, so it must lie in
 *                1..STREAM_COUNT.
 *
 * Returns the time between the `start` and `stop` events as reported by
 * cudaEventElapsedTime().
 */
float processWithStreams(int streams_used) {
    int current_stream = 0;
    float time = 0.0f;

    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
    for( int i=0; i < N / 1024 / 1024; ++i ) {
        int next_stream = (current_stream + 1 ) % streams_used;
        printf("i: %d, current_stream: %d, next_stream: %d\n", i, current_stream, next_stream);

        // Ensure that processing and copying of the last cycle has finished
        // before next_stream's buffers are overwritten below.
        // NOTE: this wait and the kernel launch that follows are not
        // error-checked, so an asynchronous kernel fault from an earlier
        // iteration is reported by the next CUDA_SAFE_CALL instead (the
        // upload below).
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        increment_kernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_in[current_stream], 1);

        // Upload next frame
        CUDA_SAFE_CALL( cudaMemcpyAsync(
            d_data_in[next_stream],
            h_data_in[next_stream],
            memsize,
            cudaMemcpyHostToDevice,
            stream[next_stream]) );

        CUDA_SAFE_CALL( cudaEventRecord(
            cycleDone[next_stream],
            stream[next_stream]) );

        // Download current frame
        // NOTE(review): the kernel above works on d_data_in, not d_data_out,
        // so this copies a buffer the visible code never writes - acceptable
        // for a timing demo, confirm if real results are needed.
        CUDA_SAFE_CALL( cudaMemcpyAsync(
            h_data_out[current_stream],
            d_data_out[current_stream],
            memsize,
            cudaMemcpyDeviceToHost,
            stream[current_stream]) );

        CUDA_SAFE_CALL( cudaEventRecord(
            cycleDone[current_stream],
            stream[current_stream]) );

        current_stream = next_stream;
    }
    CUDA_SAFE_CALL( cudaEventRecord(stop, 0) );

    // cudaEventRecord() is asynchronous: wait until `stop` has actually been
    // reached, otherwise cudaEventElapsedTime() fails with cudaErrorNotReady
    // and leaves `time` unset.
    CUDA_SAFE_CALL( cudaEventSynchronize(stop) );
    CUDA_SAFE_CALL( cudaEventElapsedTime(&time, start, stop) );
    return time;
}

Answer 1

问题出在您的内核中。CUDA的错误检查有一个特点：先前发生但尚未被检查的错误，会在下一次检查错误时才被报告。该行是内核启动之后的第一次错误检查，因此它返回了您看到的错误。

如果我没记错的话，“unspecified launch failure”（未指定的启动失败）这个错误通常与对内存的越界访问有关。

您正在使用32768个块启动内核，每个块有512个线程。计算最后一个块的最后一个线程的idx值，可得32767 * 512 + 511 = 16777215。在第一次迭代中idx < i（此时i = 512 * 32768 = 16777216），条件不成立；但在随后的迭代中idx > i，于是该线程会尝试读写g_data的第16777215个位置，而您只为1024 * 1024个整数分配了空间。

编辑：刚注意到，这个问题为什么加了“运算符重载”（operator-overloading）标签？

CUDA操作问题重叠的例子

1 个答案: