我无法调试我为其设置断点的“全局”功能行。我使用NSight菜单中的“启动CUDA调试”选项进行调试。
我的NSight插件已成功安装到VS 2010,我可以调试我的其他项目(示例项目来自NSight调试器)
我的代码在这里(它有点长,但通常会重复相同的功能):
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_cuda.h>
#include "book.h"
#define N (131072)
__global__ void conjugate( float2 *a ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
a[idx] = cuConjf(a[idx]);
}
}
__global__ void multWithReference( float2 *signal, float2 *reference ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
signal[idx].x = signal[idx].x * reference[idx].x;
signal[idx].y = signal[idx].y * reference[idx].y;
}
}
__global__ void shift( float2 *signal, size_t shiftamount, float2* shifted ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
*(shifted+((idx+shiftamount)%131072)) = *(signal+idx);
}
__global__ void fftshift(float2 *u_d)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < 131072)
{
double a = 1-2*(i&1);
u_d[i].x *= a;
u_d[i].y *= a;
}
}
static inline cufftHandle createFFTPlan(cudaStream_t* stream)
{
cufftHandle plan;
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
}
if (cufftPlan1d(&plan, 131072, CUFFT_C2C,1) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
}
if (cufftSetStream(plan, *stream) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan stream association failed");
}
return plan;
}
int main( void ) {
cudaDeviceProp prop;
int whichDevice;
HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
if (!prop.deviceOverlap) {
printf( "Device will not handle overlaps, so no speed up from streams\n" );
return 0;
}
cudaEvent_t start, stop;
float elapsedTime;
cudaStream_t stream0, stream1, stream2, stream3, stream4, stream5, stream6, stream7;
float2* host_ref, *host_0, *host_1, *host_2, *host_3, *host_4, *host_5, *host_6, *host_7;
float2* dev_ref, *dev_0, *dev_1, *dev_2, *dev_3, *dev_4, *dev_5, *dev_6, *dev_7;
// start the timers
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );
// initialize the streams
HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
HANDLE_ERROR( cudaStreamCreate( &stream1 ) );
HANDLE_ERROR( cudaStreamCreate( &stream2 ) );
HANDLE_ERROR( cudaStreamCreate( &stream3 ) );
HANDLE_ERROR( cudaStreamCreate( &stream4 ) );
HANDLE_ERROR( cudaStreamCreate( &stream5 ) );
HANDLE_ERROR( cudaStreamCreate( &stream6 ) );
HANDLE_ERROR( cudaStreamCreate( &stream7 ) );
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_ref,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_0,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_1,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_2,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_3,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_4,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_5,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_6,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_7,
N * sizeof(float2) ) );
// allocate host locked memory, used to stream
HANDLE_ERROR( cudaHostAlloc( (void**)&host_ref,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_0,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_1,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_2,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_3,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_4,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_5,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_6,
N * sizeof(float2),
cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_7,
N * sizeof(float2),
cudaHostAllocDefault ) );
// Open signal file
FILE *fp;
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_ref, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_0, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_1, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_2, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_3, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_4, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_5, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_6, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_7, sizeof(float2), 131072, fp);
fclose(fp);
// create FFT plans
cufftHandle plan0 = createFFTPlan(&stream0);
cufftHandle plan1 = createFFTPlan(&stream1);
cufftHandle plan2 = createFFTPlan(&stream2);
cufftHandle plan3 = createFFTPlan(&stream3);
cufftHandle plan4 = createFFTPlan(&stream4);
cufftHandle plan5 = createFFTPlan(&stream5);
cufftHandle plan6 = createFFTPlan(&stream6);
cufftHandle plan7 = createFFTPlan(&stream7);
float2* shifted0;
HANDLE_ERROR( cudaMalloc( (void**)&shifted0,
N * sizeof(float2) ) );
float2* shifted1;
HANDLE_ERROR( cudaMalloc( (void**)&shifted1,
N * sizeof(float2) ) );
float2* shifted2;
HANDLE_ERROR( cudaMalloc( (void**)&shifted2,
N * sizeof(float2) ) );
float2* shifted3;
HANDLE_ERROR( cudaMalloc( (void**)&shifted3,
N * sizeof(float2) ) );
float2* shifted4;
HANDLE_ERROR( cudaMalloc( (void**)&shifted4,
N * sizeof(float2) ) );
float2* shifted5;
HANDLE_ERROR( cudaMalloc( (void**)&shifted5,
N * sizeof(float2) ) );
float2* shifted6;
HANDLE_ERROR( cudaMalloc( (void**)&shifted6,
N * sizeof(float2) ) );
float2* shifted7;
HANDLE_ERROR( cudaMalloc( (void**)&shifted7,
N * sizeof(float2) ) );
HANDLE_ERROR( cudaEventRecord( start, 0 ) );
// enqueue copies of a in stream0 and stream1
HANDLE_ERROR( cudaMemcpyAsync( dev_ref, host_ref,
sizeof(float2),
cudaMemcpyHostToDevice,
stream2 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_0, host_0,
sizeof(float2),
cudaMemcpyHostToDevice,
stream0 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_1, host_1,
sizeof(float2),
cudaMemcpyHostToDevice,
stream1 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_2, host_2,
sizeof(float2),
cudaMemcpyHostToDevice,
stream2 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_3, host_3,
sizeof(float2),
cudaMemcpyHostToDevice,
stream3 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_4, host_4,
sizeof(float2),
cudaMemcpyHostToDevice,
stream4 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_5, host_5,
sizeof(float2),
cudaMemcpyHostToDevice,
stream5 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_6, host_6,
sizeof(float2),
cudaMemcpyHostToDevice,
stream6 ) );
HANDLE_ERROR( cudaMemcpyAsync( dev_7, host_7,
sizeof(float2),
cudaMemcpyHostToDevice,
stream7 ) );
for(int i = 0; i < 100; i++){
shift<<<131072,131072,0>>>(dev_0, i, shifted0);
shift<<<131072,131072,0,stream1>>>(dev_1, i, shifted1);
shift<<<131072,131072,0,stream2>>>(dev_2, i, shifted2);
shift<<<131072,131072,0,stream3>>>(dev_3, i, shifted3);
shift<<<131072,131072,0,stream4>>>(dev_4, i, shifted4);
shift<<<131072,131072,0,stream5>>>(dev_5, i, shifted5);
shift<<<131072,131072,0,stream6>>>(dev_6, i, shifted6);
shift<<<131072,131072,0,stream7>>>(dev_7, i, shifted7);
conjugate<<<131072/256,131072,0,stream0>>>(shifted0);
conjugate<<<131072/256,131072,0,stream1>>>(shifted1);
conjugate<<<131072/256,131072,0,stream2>>>(shifted2);
conjugate<<<131072/256,131072,0,stream3>>>(shifted3);
conjugate<<<131072/256,131072,0,stream4>>>(shifted4);
conjugate<<<131072/256,131072,0,stream5>>>(shifted5);
conjugate<<<131072/256,131072,0,stream6>>>(shifted6);
conjugate<<<131072/256,131072,0,stream7>>>(shifted7);
multWithReference<<<131072/256,131072,0,stream0>>>(shifted0,dev_ref);
multWithReference<<<131072/256,131072,0,stream1>>>(shifted1,dev_ref);
multWithReference<<<131072/256,131072,0,stream2>>>(shifted2,dev_ref);
multWithReference<<<131072/256,131072,0,stream3>>>(shifted3,dev_ref);
multWithReference<<<131072/256,131072,0,stream4>>>(shifted4,dev_ref);
multWithReference<<<131072/256,131072,0,stream5>>>(shifted5,dev_ref);
multWithReference<<<131072/256,131072,0,stream6>>>(shifted6,dev_ref);
multWithReference<<<131072/256,131072,0,stream7>>>(shifted7,dev_ref);
if (cufftExecC2C(plan0, shifted0, shifted0, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan1, shifted1, shifted1, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan2, shifted2, shifted2, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan3, shifted3, shifted3, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan4, shifted4, shifted4, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan5, shifted5, shifted5, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan6, shifted6, shifted6, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan7, shifted7, shifted7, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
fftshift<<<131072,131072,0,stream0>>>(shifted0);
fftshift<<<131072,131072,0,stream1>>>(shifted1);
fftshift<<<131072,131072,0,stream2>>>(shifted2);
fftshift<<<131072,131072,0,stream3>>>(shifted3);
fftshift<<<131072,131072,0,stream4>>>(shifted4);
fftshift<<<131072,131072,0,stream5>>>(shifted5);
fftshift<<<131072,131072,0,stream6>>>(shifted6);
fftshift<<<131072,131072,0,stream7>>>(shifted7);
}
if (cudaThreadSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
}
float2 *host_last = (float2 *)malloc(8*131072);
// enqueue copies of c from device to locked memory
HANDLE_ERROR( cudaMemcpyAsync( host_last, shifted0,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream0 ) );
// enqueue copies of c from device to locked memory
HANDLE_ERROR( cudaMemcpyAsync( host_0, shifted0,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream0 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_1, shifted1,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream1 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_2, shifted2,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream2 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_3, shifted3,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream3 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_4, shifted4,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream4 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_5, shifted5,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream5 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_6, shifted6,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream6 ) );
HANDLE_ERROR( cudaMemcpyAsync( host_7, shifted7,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream7 ) );
// Streamleri senkronize et
HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream2 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream3 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream4 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream5 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream6 ) );
HANDLE_ERROR( cudaStreamSynchronize( stream7 ) );
// Stop timer
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
start, stop ) );
printf( "Time taken: %3.1f ms\n", elapsedTime );
FILE *fp2;
if(NULL == (fp2 = fopen("result.bin","wb+"))){
printf("can not open file...");
exit(1);
}
fwrite(host_last, sizeof(float2), 131072, fp2);
printf("signal written \n");
fflush(stdout);
fclose(fp2);
// cleanup the streams and memory
HANDLE_ERROR( cudaFreeHost( host_0 ) );
HANDLE_ERROR( cudaFreeHost( host_1 ) );
HANDLE_ERROR( cudaFreeHost( host_2 ) );
HANDLE_ERROR( cudaFreeHost( host_3 ) );
HANDLE_ERROR( cudaFreeHost( host_4 ) );
HANDLE_ERROR( cudaFreeHost( host_5 ) );
HANDLE_ERROR( cudaFreeHost( host_6 ) );
HANDLE_ERROR( cudaFreeHost( host_7 ) );
HANDLE_ERROR( cudaFree( dev_0 ) );
HANDLE_ERROR( cudaFree( dev_1 ) );
HANDLE_ERROR( cudaFree( dev_2 ) );
HANDLE_ERROR( cudaFree( dev_3 ) );
HANDLE_ERROR( cudaFree( dev_4 ) );
HANDLE_ERROR( cudaFree( dev_5 ) );
HANDLE_ERROR( cudaFree( dev_6 ) );
HANDLE_ERROR( cudaFree( dev_7 ) );
cufftDestroy(plan0);
cufftDestroy(plan1);
cufftDestroy(plan2);
cufftDestroy(plan3);
cufftDestroy(plan4);
cufftDestroy(plan5);
cufftDestroy(plan6);
cufftDestroy(plan7);
HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
HANDLE_ERROR( cudaStreamDestroy( stream1 ) );
HANDLE_ERROR( cudaStreamDestroy( stream2 ) );
HANDLE_ERROR( cudaStreamDestroy( stream3 ) );
HANDLE_ERROR( cudaStreamDestroy( stream4 ) );
HANDLE_ERROR( cudaStreamDestroy( stream5 ) );
HANDLE_ERROR( cudaStreamDestroy( stream6 ) );
HANDLE_ERROR( cudaStreamDestroy( stream7 ) );
printf("hit [enter] to exit...");
fflush(stdout);
getchar();
return 0;
}
重现问题所需的二进制文件位于以下链接中:
当我在发布exe文件上运行“cuda-memcheck”时,我得到以下结果:
答案 0 :(得分:1)
在Nsight VSE中调试GPU代码时,需要通过Nsight菜单(“启动CUDA调试”)开始调试。有关详细信息,请参阅this walkthrough。
修改强>
根据您提供的其他信息,特别是cuda-memcheck
输出,您的内核似乎并未实际启动。错误9是cudaErrorInvalidConfiguration,表示启动配置(块,线程/块,模糊/块)与设备不兼容。
cudaErrorInvalidConfiguration = 9
这表明内核启动正在请求当前设备永远无法满足的资源。每个块请求比设备支持更多的共享内存将触发此错误,同时请求太多线程或块。有关更多设备限制,请参阅cudaDeviceProp。
实际上,您正在尝试启动131072个线程/块,这超出了限制(请参阅details的编程指南和specific limits)。您应该启动较小的块并相应地增加块数。
正如Robert Crovella所说,你应该始终确保你有适当的错误检查。