This is my first post here, and I should say up front that I have no prior experience with CUDA. I am building a search program in CUDA (C) with Visual Studio 2015. I want to search a large file (in this case a Wikipedia dump) for a given word and print every position where it is found (for example the line, or the position of the word's first character). The problem is that the parallel program is much slower than the serial one.
I use fread() to load 1024 characters of the file at a time, and each thread checks whether its character of the loaded chunk matches the first letter of the search word. If it does, the thread compares the following characters one by one; otherwise it stops. This repeats in a while loop until the end of the file. I know that constantly transferring data to the GPU is slow, but is it really this slow? Is there a way to transfer the whole file to the GPU and call the kernel only once, or is there a better approach? A rough sketch of what I mean is just below, followed by my current code.
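To make the question concrete, this is roughly what I have in mind by "transfer the whole file and call the kernel once". It is only an untested sketch with no error checking, and it assumes the file fits in host and device memory; fileSize, h_data, blocks and threadsPerBlock are placeholder names that are not in my current code, and dev_keyword / dev_slp would be set up the same way as in my helper function further down:

// Sketch of the single-transfer idea: read the whole file into one host
// buffer, copy it to the GPU once, and launch one kernel over all of it.
FILE *fp;
fopen_s(&fp, fname, "rb");
fseek(fp, 0, SEEK_END);
long fileSize = ftell(fp);                 // total number of bytes in the file
fseek(fp, 0, SEEK_SET);

char *h_data = (char *)malloc(fileSize);
fread(h_data, 1, fileSize, fp);            // one read for the whole file
fclose(fp);

char *dev_data = 0;
int *dev_result = 0;
cudaMalloc((void**)&dev_data, fileSize * sizeof(char));
cudaMalloc((void**)&dev_result, fileSize * sizeof(int));
cudaMemcpy(dev_data, h_data, fileSize * sizeof(char), cudaMemcpyHostToDevice);   // single copy

int threadsPerBlock = 256;
int blocks = (int)((fileSize + threadsPerBlock - 1) / threadsPerBlock);
// Single launch over the whole buffer; the kernel would also need a bounds
// check so that threads near the end do not read past fileSize.
searchKeywordKernel<<<blocks, threadsPerBlock>>>(dev_result, dev_data, dev_keyword, dev_slp);
cudaDeviceSynchronize();

Is something along these lines the right direction, or is there a better pattern for this?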
The kernel of my program:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define DATASIZE 1024
#define GRID_SIZE 1

// Forward declaration so main() can call the helper defined below.
cudaError_t searchKeyword(int *result, char *data, char *keyword, int sl);

__global__ void searchKeywordKernel(int *result, char *data, char *keyword, int *sl)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int match = 0;

    // Check whether the character owned by this thread matches the first
    // character of the keyword.
    if (data[i] == keyword[0]) {
        // Compare the remaining keyword characters one by one.
        for (int j = 1; j < *sl; j++) {
            if (data[i + j] == keyword[j]) {
                match = 1;
            }
            else {
                match = 0;
                break;
            }
        }
        if (match == 1) {
            // Store the position of the first matching character in the result list.
            result[i] = 1;
            printf("Character found at position %i\n", i);
        }
        else {
            result[i] = 0;
        }
    }
}
The main function:
int main()
{
    // Timers
    clock_t start, end, tstart, tend;
    double cpu_time_used, tcpu_time_used;
    double total_time = 0;
    char data[DATASIZE];
    // Keyword to search for
    char *keyword = "wiki";
    int linenum = 0;
    int result[DATASIZE];
    char *fname = "F:\\simplewiki-20170820-pages-meta-current.xml";

    // Clear the result array
    memset(result, 0, DATASIZE * sizeof(int));
    tstart = clock();

    // Open the file
    FILE *fp;
    int sl = strlen(keyword);
    if (fopen_s(&fp, fname, "r") != 0) {
        printf("WRONG FILE");
    }

    int total_matches = 0;
    start = clock();
    while (fread(data, DATASIZE, 1, fp) == 1) {
        end = clock();
        cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
        printf("Time:%f \n", cpu_time_used);
        total_time = total_time + cpu_time_used;

        // Search for the keyword in parallel.
        cudaError_t cudaStatus = searchKeyword(result, data, keyword, sl);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "searchKeyword failed!");
            return 1;
        }

        // Print the positions of the matches found in this chunk.
        for (int i = 0; i < DATASIZE; i++) {
            if (result[i] == 1) {
                printf("LINE:%d \n", linenum);
                printf("Character found at position %i\n", i);
                total_matches++;
                //printf("FOUND: %s \n", data);
            }
        }
        memset(result, 0, DATASIZE * sizeof(int));

        // cudaDeviceReset must be called before exiting in order for profiling and
        // tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!");
            return 1;
        }
        linenum++;
    }
    tend = clock();
    tcpu_time_used = ((double)(tend - tstart)) / CLOCKS_PER_SEC;

    printf("Total matches = %d\n", total_matches);
    printf("Total Time:%f \n", total_time);
    printf("Total Time of everything:%f \n", tcpu_time_used);
    system("pause");
    return 0;
}
The helper function that does the memory allocation and the memory copies:
cudaError_t searchKeyword(int *result, char *data, char *keyword, int sl)
{
    char *dev_data = 0;
    char *dev_keyword = 0;
    int *dev_result = 0;
    int *dev_slp = 0;
    int *slp = &sl;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffer for the result array.
    cudaStatus = cudaMalloc((void**)&dev_result, DATASIZE * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Allocate GPU buffer for the data chunk.
    cudaStatus = cudaMalloc((void**)&dev_data, DATASIZE * sizeof(char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Allocate GPU buffer for the keyword.
    cudaStatus = cudaMalloc((void**)&dev_keyword, (sl + 1) * sizeof(char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Allocate GPU buffer for the keyword length.
    cudaStatus = cudaMalloc((void**)&dev_slp, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the data chunk from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_data, data, DATASIZE * sizeof(char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Copy the keyword length from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_slp, slp, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Copy the keyword from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_keyword, keyword, (sl + 1) * sizeof(char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the search kernel on the GPU with one thread per data element.
    searchKeywordKernel<<<GRID_SIZE, DATASIZE>>>(dev_result, dev_data, dev_keyword, dev_slp);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching searchKeywordKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the result from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(result, dev_result, DATASIZE * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_result);
    cudaFree(dev_data);
    cudaFree(dev_keyword);
    cudaFree(dev_slp);
    return cudaStatus;
}
My GPU is a GTX 660 Ti. Please ignore the logic issues in my code that could be changed to make the program faster, and focus on what I think is the real problem: the very long run time and the "bad" use of CUDA memory copies inside the while loop, which I believe is my main issue. The program is about 50 times slower than the serial version. Thanks for any help.
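To show what I mean by moving the CUDA memory work out of the while loop, here is another rough, untested sketch, reusing the same variables as in my main(): device buffers allocated once before the loop, only the current chunk copied inside it, and everything freed once at the end. I have not tried this, so it is only the idea, not working code:

// Sketch only: allocate once, reuse the device buffers across chunks, free once.
char *dev_data = 0, *dev_keyword = 0;
int *dev_result = 0, *dev_sl = 0;
cudaMalloc((void**)&dev_data, DATASIZE * sizeof(char));
cudaMalloc((void**)&dev_result, DATASIZE * sizeof(int));
cudaMalloc((void**)&dev_keyword, (sl + 1) * sizeof(char));
cudaMalloc((void**)&dev_sl, sizeof(int));
cudaMemcpy(dev_keyword, keyword, (sl + 1) * sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_sl, &sl, sizeof(int), cudaMemcpyHostToDevice);

while (fread(data, DATASIZE, 1, fp) == 1) {
    // Only the new chunk crosses the PCIe bus in each iteration.
    cudaMemcpy(dev_data, data, DATASIZE * sizeof(char), cudaMemcpyHostToDevice);
    searchKeywordKernel<<<GRID_SIZE, DATASIZE>>>(dev_result, dev_data, dev_keyword, dev_sl);
    cudaMemcpy(result, dev_result, DATASIZE * sizeof(int), cudaMemcpyDeviceToHost);
    // ... process result on the host as before ...
}

cudaFree(dev_data);
cudaFree(dev_result);
cudaFree(dev_keyword);
cudaFree(dev_sl);
cudaDeviceReset();   // once, at program exit, not inside the loop

Is this the kind of restructuring that would remove most of the slowdown, or is the per-chunk kernel launch itself also a problem?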