Question

在CUDA文档中，我发现cudaDeviceGetAttribute是一个__host__ __device__函数。所以我想我可以在我的__global__函数中调用它来获取我的设备的一些属性。遗憾的是，它的含义似乎并非如此，因为即使我把它放入__device__函数并从我的__global__函数中调用它，也会收到编译错误。

是否可以在我的GPU上调用cudaDeviceGetAttribute？或者__host__ __device__还有什么意思？

这是我的源代码：

// Device-side helper: asks the CUDA runtime for the max-threads-per-block
// attribute of device 0 and has the result written to *unique.
// The status code returned by cudaDeviceGetAttribute is not inspected, so
// *unique is only meaningful when the query succeeded.
// Using this runtime API call from device code needs a build for compute
// capability 3.5 or higher with relocatable device code, linked against the
// device runtime library (e.g. nvcc -arch=sm_35 -rdc=true ... -lcudadevrt).
__device__ void GetAttributes(int* unique)
{
    const int deviceOrdinal = 0;
    cudaDeviceGetAttribute(unique, cudaDevAttrMaxThreadsPerBlock, deviceOrdinal);
}


// Timing kernel: samples clock() before and after a few integer updates of
// *a and *b plus a call to GetAttributes(unique), then stores the tick
// difference in *return_time.
// There is no per-thread indexing, so every launched thread touches the same
// locations; the caller in this file launches it as <<<1,1>>>.
__global__ void ClockTest(int* a, int* b, long* return_time, int* unique)
{
    const clock_t tick_begin = clock();

    // some complex calculations (placeholder arithmetic being timed)

    *a += *b;
    *b = *a + *a;

    GetAttributes(unique);

    *a = *a + *b - *a;

    const clock_t tick_end = clock();
    *return_time = tick_end - tick_begin;
}

// Prints a failed CUDA runtime call to stdout.
// Returns true when err is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t err, const char* what)
{
    if (err == cudaSuccess) {
        return true;
    }
    printf("CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Runs ClockTest ten times. Each iteration allocates the device buffers,
// uploads a and b, launches the kernel with one thread, reads back a, the
// elapsed clock ticks and the queried attribute, prints the last two, and
// resets the device. Returns 1 as soon as any CUDA call fails, 0 otherwise.
int main()
{
    int a = 2;
    int b = 3;
    long time = 0;
    int uni = 0;

    for (int i = 0; i < 10; ++i) {

        // Null-initialised so that the cudaFree calls below are no-ops for
        // any buffer whose allocation was never reached.
        int* dev_a = 0;
        int* dev_b = 0;
        long* dev_time = 0;
        int* unique = 0;

        bool ok =
            checkCuda(cudaMalloc(&dev_a, sizeof(int)), "cudaMalloc(dev_a)") &&
            checkCuda(cudaMalloc(&dev_b, sizeof(int)), "cudaMalloc(dev_b)") &&
            checkCuda(cudaMalloc(&dev_time, sizeof(long)), "cudaMalloc(dev_time)") &&
            checkCuda(cudaMalloc(&unique, sizeof(int)), "cudaMalloc(unique)") &&
            checkCuda(cudaMemcpy(dev_a, &a, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)") &&
            checkCuda(cudaMemcpy(dev_b, &b, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

        if (ok) {
            ClockTest <<<1,1>>>(dev_a, dev_b, dev_time, unique);

            // A launch does not return a status; launch failures are picked
            // up with cudaGetLastError(), and faults during execution
            // surface in the blocking device-to-host copies that follow.
            ok = checkCuda(cudaGetLastError(), "ClockTest launch") &&
                 checkCuda(cudaMemcpy(&a, dev_a, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(a)") &&
                 checkCuda(cudaMemcpy(&time, dev_time, sizeof(long), cudaMemcpyDeviceToHost), "cudaMemcpy(time)") &&
                 checkCuda(cudaMemcpy(&uni, unique, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(uni)");
        }

        // cudaFree takes the device pointer itself, not the address of the
        // host variable that holds it.
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_time);
        cudaFree(unique);

        if (!ok) {
            cudaDeviceReset();
            return 1;
        }

        // time is a long, so it needs %ld rather than %d.
        printf("%ld\n", time);
        printf("unique: %d\n", uni);

        cudaDeviceReset();
    }

    return 0;
}

Answer 1

编辑：抱歉，我之前的回答不正确。 nvcc似乎存在问题（见下文）。

cudaDeviceGetAttribute可以在设备代码中正常工作，这是K20X，CUDA 8.0.61上的一个有效例子：

$ cat t1305.cu
#include <stdio.h>

// Demo kernel: queries cudaDevAttrMaxThreadsPerBlock for device 0 from device
// code and prints the returned status (numeric value and text) followed by
// the attribute value. The value is printed even when the query reports an
// error.
__global__ void tkernel(){

  int maxThreads;
  const cudaError_t status =
      cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, 0);

  // Keep this second runtime API call in the kernel: the accompanying text
  // reports that nvcc may not generate correct code when
  // cudaDeviceGetAttribute is the only runtime API call in a kernel.
  const char* statusText = cudaGetErrorString(status);

  printf("err = %d, %s\n", status, statusText);
  printf("val = %d\n", maxThreads);

}


// Launches tkernel with a single thread and waits for it to finish.
// Returns 0 on success; prints the CUDA error and returns 1 if the launch or
// the kernel execution fails (for example when the binary was not built with
// relocatable device code for a cc 3.5+ device).
int main(){

  tkernel<<<1,1>>>();

  // The launch itself returns no status: fetch launch errors explicitly,
  // then synchronize to flush device printf output and surface any error
  // raised while the kernel ran.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    err = cudaDeviceSynchronize();
  }
  if (err != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}


$ nvcc -arch=sm_35 -o t1305 t1305.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t1305
========= CUDA-MEMCHECK
err = 0, no error
val = 1024
========= ERROR SUMMARY: 0 errors
$

有多种运行时API函数支持在设备代码中使用。对于受支持的运行时API函数，通常需要：

针对计算能力（cc）3.5或更高版本的设备进行编译
使用可重定位设备代码进行编译
链接cuda设备运行时库

此外，您的代码还有一些其他编码错误：传递给cudaFree的应该是指针本身，而不是指针的地址。

这个特殊功能的注意事项：

在CUDA编译器中似乎存在一个问题，即如果在内核代码中没有任何其他运行时API调用的情况下使用此设备运行时API调用，则无法正确生成代码。目前的解决方法是确保您的内核至少包含一个其他CUDA运行时API调用。在上面的示例中，我使用了cudaGetErrorString，但我想您也可以使用cudaDeviceSynchronize()或其他任何运行时API调用。我已就此问题提交了NVIDIA内部错误报告。
编程指南（上面的链接）的CDP部分支持的设备运行时API调用列表中似乎存在文档错误。函数cudaGetDeviceProperty不存在，但我认为它应该引用cudaDeviceGetAttribute。我已针对此文档错误提交了内部NVIDIA错误。

我不能从device函数调用host device函数吗？

1 个答案:

我不能从__device__函数调用__host__ __device__函数吗？

1 个答案:

我不能从device函数调用host device函数吗？