CUDA - the same algorithm works on the CPU but not on the GPU

Date: 2015-12-14 14:44:04

Tags: cuda gpgpu

I am currently working on my first CUDA project, and I have run into something strange that must be inherent to CUDA and that I either don't understand or am overlooking. The very same algorithm, one that involves no parallel work at all, works on the CPU but not on the GPU.

Let me explain in more detail. I am thresholding images with Otsu's method, recomputing the threshold redundantly on the device to reduce transfer time. Long story short, this function:

__device__ double computeThreshold(unsigned int* histogram, int* nbPixels){
    double sum = 0;
    for (int i = 0; i < 256; i++){
        sum += i*histogram[i];
    }
    int sumB = 0, wB = 0, wF = 0;
    double mB, mF, max = 1, between = 0, threshold1 = 0, threshold2 = 0;
    for (int j = 0; j < 256 && !(wF == 0 && j != 0 && wB != 0); j++){
        wB += histogram[j];
        if (wB != 0) {
            wF = *nbPixels - wB;
            if (wF != 0){
                sumB += j*histogram[i];
                mB = sumB / wB;
                mF = (sum - sumB) / wF;
                between = wB * wF *(mB - mF) *(mB - mF);
                if (max < 2.0){
                    threshold1 = j;
                    if (between > max){
                        threshold2 = j;
                    }
                    max = between;
                }
            }
        }
    }

    return (threshold1 + threshold2) / 2.0;
}

This works when the image size (i.e. the number of pixels) is not too large, but fails for the intended image size. Interestingly, it still fails even if I don't use histogram and nbPixels inside the function and replace all their occurrences with constants, and even if I remove the parameters from the function altogether. (By "fails" I mean that the first operation after the kernel call returns an unspecified launch failure.)
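Not part of the original question, but the usual way to surface such failures at the point where they occur is to check the status of every CUDA runtime call, and to check the kernel launch and its asynchronous execution separately. A minimal sketch of such a checking macro (the macro name is my own, not from the question):

```cuda
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: wraps a CUDA runtime call and aborts with a
// readable message if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// A kernel launch reports errors in two stages, so both must be checked:
//   imageKernel<<<dimGrid, dimBlock>>>(...);
//   CUDA_CHECK(cudaGetLastError());        // launch/configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // errors during execution
```

With `cudaDeviceSynchronize()` checked right after the launch, an "unspecified launch failure" is reported there instead of at the next unrelated `cudaMemcpy`.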

Edit 3: OK, a small mistake had crept in because of a copy/paste error I made before testing. This version now compiles and reproduces the error:

__device__ double computeThreshold(unsigned int* histogram, long int* nbPixels){
    double sum = 0;
    for (int i = 0; i < 256; i++){
        sum += i*histogram[i];
    }
    int sumB = 0, wB = 0, wF = 0;
    double mB, mF, max = 1, between = 0, threshold1 = 0, threshold2 = 0;
    for (int j = 0; j < 256 && !(wF == 0 && j != 0 && wB != 0); j++){
        wB += histogram[j];
        if (wB != 0) {
            wF = *nbPixels - wB;
            if (wF != 0){
                sumB += j*histogram[j];
                mB = sumB / wB;
                mF = (sum - sumB) / wF;
                between = wB * wF *(mB - mF) *(mB - mF);
                if (max < 2.0){
                    threshold1 = j;
                    if (between > max){
                        threshold2 = j;
                    }
                    max = between;
                }
            }
        }
    }

    return (threshold1 + threshold2) / 2.0;
}

__global__ void imageKernel(unsigned int* image, unsigned int* histogram, long int* nbPixels, double* t_threshold){

    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (i >= *nbPixels) return;
    double threshold = computeThreshold(histogram, nbPixels);
    unsigned int pixel = image[i];
    if (pixel >= threshold){
        pixel = 255;
    } else {
        pixel = 0;
    }
    image[i] = pixel;

    *t_threshold = threshold;

}


int main(){
    unsigned int histogram[256] = { 0 };
    const int width = 2048 * 4096;
    const int height = 1;

    unsigned int* myimage;
    myimage = new unsigned int[width*height];
    for (int i = 0; i < width*height; i++){
        myimage[i] = i % 256;
        histogram[i % 256]++;
    }
    const int threadPerBlock = 256;
    const int nbBlock = ceil((double)(width*height) / threadPerBlock);
    unsigned int* partial_histograms = new unsigned int[256 * nbBlock];

    dim3 dimBlock(threadPerBlock, 1);
    dim3 dimGrid(nbBlock, 1);
    unsigned int* dev_image;
    unsigned int* dev_histogram;
    unsigned int* dev_partial_histograms;
    double* dev_threshold;
    double x = 0;
    double* threshold = &x;
    long int* nbPixels;
    long int nb = width*height;
    nbPixels = &(nb);
    long int* dev_nbPixels;

    cudaSetDevice(0);
    cudaMalloc((void**)&dev_image, sizeof(unsigned int)*width*height);
    cudaMalloc((void**)&dev_histogram, sizeof(unsigned int)* 256);
    cudaMalloc((void**)&dev_partial_histograms, sizeof(unsigned int)* 256 * nbBlock);
    cudaMalloc((void**)&dev_threshold, sizeof(double));
    cudaMalloc((void**)&dev_nbPixels, sizeof(long int));
    cudaMemcpy(dev_image, myimage, sizeof(unsigned int)*width*height, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_histogram, histogram, sizeof(unsigned int)* 256, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_nbPixels, nbPixels, sizeof(long int), cudaMemcpyHostToDevice);

    imageKernel<<<dimGrid, dimBlock>>>(dev_image, dev_histogram, dev_nbPixels, dev_threshold);

    cudaMemcpy(histogram, dev_histogram, sizeof(unsigned int)* 256, cudaMemcpyDeviceToHost);
    cudaMemcpy(partial_histograms, dev_partial_histograms, sizeof(unsigned int)* 256 * nbBlock, cudaMemcpyDeviceToHost);
    cudaMemcpy(threshold, dev_threshold, sizeof(double), cudaMemcpyDeviceToHost);

    cudaDeviceReset();

    return 0;
}

Edit 4: the specifications of my GPU

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GT 750M"
  CUDA Driver Version / Runtime Version          7.5 / 7.5
  CUDA Capability Major/Minor version number:    3.0
  Total amount of global memory:                 2048 MBytes (2147483648 bytes)
  ( 2) Multiprocessors, (192) CUDA Cores/MP:     384 CUDA Cores
  GPU Max Clock rate:                            1085 MHz (1.09 GHz)
  Memory Clock rate:                             900 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 262144 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.5, CUDA Runtime Version = 7.5, NumDevs = 1, Device0 = GeForce GT 750M
Result = PASS

Edit 5: I ran cuda-memcheck again, and this time it did produce an error report. I don't know why it didn't the first time; I must have done something wrong. Please forgive my hesitations and the wasted time. Here is the output:

========= CUDA-MEMCHECK
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc764]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
=========     Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc788]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
=========     Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuProfilerStop + 0xb7802) [0xdb1e2]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0x160f]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xc7a6]
=========     Host Frame:C:\Users\Nicolas\Cours\3PC\test.exe [0xfe24]
=========     Host Frame:C:\WINDOWS\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x13d2]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x15454]
=========
========= ERROR SUMMARY: 3 errors

Not exactly conclusive, is it?
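As an aside not from the original thread: the backtraces above only show host frames because the binary carries no device line information. Recompiling with nvcc's standard `-lineinfo` option lets cuda-memcheck map device-side errors back to kernel source lines, which usually makes such reports far more conclusive (the file name here is the one from the question; the exact command line is an illustration):

```shell
# Recompile with device line information (standard nvcc flag),
# then rerun under cuda-memcheck to get file:line device backtraces.
nvcc -arch=sm_30 -lineinfo -o test test.cu
cuda-memcheck ./test
```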

2 Answers:

Answer 0 (score: 1)

So, now that a compilable example has been provided (was that really so hard?), I cannot reproduce any error with this code (64-bit Linux, a compute capability 3.0 device, CUDA 7.0 release version):

$ nvcc -arch=sm_30 -Xptxas="-v" histogram.cu 
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z11imageKernelPjS_PlPd' for 'sm_30'
ptxas info    : Function properties for _Z11imageKernelPjS_PlPd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 34 registers, 352 bytes cmem[0], 16 bytes cmem[2]

$ for i in `seq 1 20`;
> do
>     cuda-memcheck ./a.out
> done
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors

So if you can reproduce the runtime error where I cannot, your environment/hardware/toolkit version must differ from mine in some subtle way. In any case, the code itself works, and you have a platform-specific problem that I cannot reproduce.

Answer 1 (score: 1)

OK, it turns out it was not a bug on my side: Windows decided that 2 seconds was enough and that it needed to reset the GPU, stopping my computation. Many thanks to @RobertCrovella, without whom I would never have discovered this. And thanks to everyone who tried to answer.
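For reference (not part of the original thread): the reset described here is Windows' WDDM Timeout Detection and Recovery (TDR) mechanism, which is controlled by registry values under the GraphicsDrivers key; `TdrDelay` defaults to 2 seconds, matching the limit observed above. A sketch of the relevant registry fragment (the example value of 10 seconds is my own choice, and a reboot is needed for it to take effect):

```
; WDDM Timeout Detection and Recovery (TDR) settings.
; TdrDelay = seconds the OS waits for the GPU before resetting it (default 2).
[HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers]
"TdrDelay"=dword:0000000a   ; raise the timeout to 10 seconds (example value)
```

The longer-term fixes are to split long-running kernels into shorter launches, or to run CUDA work on a GPU that is not driving a display (TCC mode), so the watchdog does not apply.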