Question

我有几次执行CUDA内核的问题。我的代码中的环境有些不对劲。第一次代码正常工作，在第三次调用之前第二次清理environemnt时会出现随机崩溃。我认为由于某种原因我有记忆腐败。 CUDA驱动程序中有时会发生崩溃，有时简单的printf崩溃或廉价的kernel32.dll。我想我的代码中存在内存管理问题。

再次执行内核之前应该做些什么？

当我执行一次时，此代码有效。我正在使用CURAND初始化随机生成器。这是我的代码：

    #define GRID_BLOCK 64
    #define GRID_THREAD 8
    #define CITIES 100
    #define CIPOW2 101
    int lenghtPaths = GRID_BLOCK*GRID_THREAD;
    int cities = CITIES; 
    //prepare CURAND 
    curandState *devStates;
    CUDA_CALL(cudaMalloc((void **)&devStates, GRID_BLOCK*GRID_THREAD*sizeof(curandState)));
    /* Setup prng states */
    setup_kernel<<<GRID_BLOCK ,GRID_THREAD>>>(devStates);
    CUDA_CALL(cudaDeviceSynchronize());
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) 
        fprintf(stderr, "CURAND preparation failed: %s\n", cudaGetErrorString(cudaStatus));
    //copy distance grid to constant memory 
    cudaMemcpyToSymbol(cdist, dist, sizeof(int) *CIPOW2*CIPOW2);
    CUDA_CALL(cudaMalloc((void**)&dev_pathsForThreads, lenghtPaths * cities * sizeof(int)));
    CUDA_CALL(cudaMalloc((void**)&d_results, GRID_BLOCK*GRID_THREAD * sizeof(int)));
    for (int k = 0; k < 5; k++){
        int* pathsForThreads;
        pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
        pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
        CUDA_CALL(cudaMemcpy(dev_pathsForThreads, pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyHostToDevice));
        GPUAnnealing<<<GRID_BLOCK ,GRID_THREAD >>>(dev_pathsForThreads, devStates, iterationLimit,temperature, coolingRate, absoluteTemperature, cities,d_results);
        CUDA_CALL(cudaDeviceSynchronize());
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) 
            fprintf(stderr, "GPUAnnealing launch failed: %s\n", cudaGetErrorString(cudaStatus));
        h_results = (int*) malloc(GRID_BLOCK*GRID_THREAD * sizeof(int));
        //Copy lenght of each path to CPU 
        CUDA_CALL(cudaMemcpy(h_results, d_results,  GRID_BLOCK*GRID_THREAD * sizeof(int),cudaMemcpyDeviceToHost));
        //Copy paths to CPU 
        CUDA_CALL(cudaMemcpy(pathsForThreads, dev_pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyDeviceToHost));
        //check the shortest path                       
        shortestPath = FindTheShortestPath(h_results);
        fprintf (stdout, "Shortest path on index = %d value = %d \n", shortestPath, h_results[shortestPath]);
        for (int i = 0; i < GRID_BLOCK*GRID_BLOCK ; i++)
            Path[i] = pathsForThreads[shortestPath*CITIES +i]; 
        free(pathsForThreads);
        free(h_results);
    }
    CUDA_CALL(cudaFree(dev_pathsForThreads));
    CUDA_CALL(cudaFree(d_results));
    CUDA_CALL(cudaFree(devStates));
    CUDA_CALL(cudaDeviceReset());

Answer 1

这是个坏主意：

    pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
    pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);

如果对PreaparePaths的调用将pathsForThreads的某些其他值分配给malloc操作分配给它的其他值，则稍后执行此操作时

    free(pathsForThreads);

你将获得不可预测的结果。

您不应该将您随后要传递给free的指针重新分配给其他值。免费的手册页指示：

  free() frees the memory space pointed to by ptr, which must  have  been
   returned by a previous call to malloc(), calloc() or realloc().

如果您打算将指针传递给free

，则不允许将指针重新分配给其他内容。

几次执行CUDA内核

1 个答案: