Question

为了查看我的GPU的计算能力（compute capability），我写了一个小程序：

// Print the compute capability (major.minor) of every CUDA device the runtime reports.
cudaDeviceProp prop;
int count = 0;  // stays 0 if the query below fails, so the loop is skipped
cudaError_t err = cudaGetDeviceCount( &count );
if (err != cudaSuccess) {
    fprintf( stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString( err ) );
    count = 0;  // do not trust whatever the failed call left behind
}
for (int i = 0; i < count; i++) {
    err = cudaGetDeviceProperties( &prop, i );
    if (err != cudaSuccess) {
        // prop is not valid for this device; report and move on to the next one
        fprintf( stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString( err ) );
        continue;
    }
    printf( "Compute capability:  %d.%d\n", prop.major, prop.minor );
}

它对我所有的GPU都打印出 3.5。

现在我尝试用 nvcc -c 编译下面这个玩具程序：

// Element-wise addition: the block with index blockIdx.x stores a[idx] + b[idx]
// into c[idx]. Blocks whose index is not below n do nothing.
//
// NOTE(review): this function is declared __global__, yet add_kernel invokes it
// with ordinary call syntax. That combination is what the reported nvcc error
// complains about. If this is only meant to be a helper called from a kernel,
// __device__ is presumably the intended qualifier -- confirm with the author.
__global__ void add_device(float *a, float *b, float *c, int n)
{
    const int idx = blockIdx.x;
    if (idx >= n) {
        return;  // out-of-range block: nothing to write
    }
    c[idx] = a[idx] + b[idx];
}

// Kernel launched by gpu_add; it forwards all four arguments to add_device.
// NOTE(review): add_device is itself declared __global__, and the call below
// uses plain function-call syntax with no <<<...>>> launch configuration.
// This is the call nvcc reports in the quoted error message.
__global__ void add_kernel(float *a, float *b, float *c, int n) 
{
    // Ordinary call into another __global__ function (see note above).
    add_device(a, b, c, n);
}

// Host-side wrapper: launches add_kernel on a grid of n blocks with one thread
// per block, passing a, b, c and n through unchanged.
// The pointers are handed straight to the kernel -- presumably device pointers
// to at least n floats each; TODO confirm against the caller.
// The launch result is not checked here; callers can query cudaGetLastError().
void gpu_add(float *a, float *b, float *c, int n)
{
    // A grid of zero blocks (or a negative count converted to an unsigned grid
    // size) is not a valid launch configuration, and there is nothing to add.
    if (n <= 0) {
        return;
    }
    add_kernel<<<n, 1>>>( a, b, c, n );
}

但编译时出现以下错误：

calling a __global__ function("add_device") from a __global__ function("add_kernel") is only allowed on the compute_35 architecture or above

我做错了什么？

计算能力与从内核调用内核

0 个答案: