为了了解我的计算能力,我有一个小程序:
cudaDeviceProp prop;
int count;
cudaGetDeviceCount( &count );
for (int i=0; i< count; i++) {
cudaGetDeviceProperties( &prop, i );
printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
}
并为我的所有GPU打印3.5。
现在我尝试编译以下玩具程序(由nvcc -c):
__global__ void add_device(float *a, float *b, float *c, int n)
{
int i = blockIdx.x;
if (i < n) {
c[i] = a[i] + b[i];
}
}
__global__ void add_kernel(float *a, float *b, float *c, int n)
{
add_device(a, b, c, n);
}
void gpu_add(float *a, float *b, float *c, int n)
{
add_kernel<<<n, 1>>>( a, b, c, n );
}
但我的编译导致以下错误:
calling a __global__ function("add_device") from a __global__ function("add_kernel") is only allowed on the compute_35 architecture or above
我做错了什么?