Question

我遇到了一个奇怪的现象：

// Wrapper placed around the CUDA runtime calls below. As defined here it
// expands to the bare call, so the returned status is not inspected.
#define CUDA_ERR_CHECK(call) call

#include <assert.h>
#include <iostream>

using namespace std;

#if defined(__CUDACC__)

// Determine the size of type T on the device.
// Stores sizeof(T), as evaluated in device code, into *result. Every thread
// that runs performs the same single store; the host code in this file
// launches it as <<<1, 1>>> with a pointer obtained from cudaMalloc.
template<typename T>
__global__ void deviceSizeOf(size_t* result)
{
    *result = sizeof(T);
}

// Device memory aligned vector.
// In the code shown here the class only records its dimensions and, when
// constructed on the host, verifies that sizeof(T) is the same on host and
// device. No storage is allocated for `data` in this listing.
template<typename T>
class VectorDevice
{
    T* data;
    size_t size;
    // dim: requested element count; dim_aligned: dim rounded up to a multiple
    // of AVX_VECTOR_SIZE (a macro that is not defined in this listing itself).
    int dim, dim_aligned;

public :
    // Creates an empty vector descriptor (null data, zero size, zero dim).
    // NOTE(review): dim_aligned is not initialized by this constructor.
    __host__ __device__
    VectorDevice() : data(NULL), size(0), dim(0) { }

    // Creates a vector descriptor for dim_ elements and computes dim_aligned.
    // In the host compilation pass it additionally launches deviceSizeOf<T>
    // and terminates the process with status 2 if the device-side sizeof(T)
    // differs from the host-side one.
    __host__ __device__
    VectorDevice(int dim_) : data(NULL), size(0), dim(dim_)
    {
        // Round dim_ up to the next multiple of AVX_VECTOR_SIZE when it is
        // not already a multiple.
        dim_aligned = dim_;
        if (dim_ % AVX_VECTOR_SIZE)
            dim_aligned = dim + AVX_VECTOR_SIZE - dim_ % AVX_VECTOR_SIZE;
        // Everything up to #endif is compiled only when __CUDA_ARCH__ is not
        // defined, i.e. only for the host version of this constructor.
        // NOTE(review): this is the only place deviceSizeOf<T> is
        // instantiated, so the instantiation depends on __CUDA_ARCH__ being
        // undefined. The surrounding discussion identifies this as the cause
        // of the silent exit(1) in the kernel launch stub.
#if !defined(__CUDA_ARCH__)
        // Determine the size of target type.
        // NOTE(review): this local `size` shadows the member `size`; the
        // member keeps the value 0 set in the initializer list.
        size_t size, *dSize;
        CUDA_ERR_CHECK(cudaMalloc(&dSize, sizeof(size_t)));
        deviceSizeOf<T><<<1, 1>>>(dSize);
        // Pick up launch errors, then wait for the kernel to finish before
        // copying its result back to the host.
        CUDA_ERR_CHECK(cudaGetLastError());
        CUDA_ERR_CHECK(cudaDeviceSynchronize());
        CUDA_ERR_CHECK(cudaMemcpy(&size, dSize, sizeof(size_t), cudaMemcpyDeviceToHost));
        CUDA_ERR_CHECK(cudaFree(dSize));

        // Make sure the size of type is the same on host and on device.
        if (size != sizeof(T))
        {
            std::cerr << "Unexpected unequal sizes of type T in VectorDevice<T> on host and device" << std::endl;
            exit(2);
        }
#endif
    }
};

#endif // __CUDACC__

// Constructs a VectorDevice<int> with dimension 10 on the host, which runs the
// host branch of the constructor (including the deviceSizeOf<int> launch),
// then returns 0.
int main()
{
    VectorDevice<int> v(10);

    return 0;
}

这里，内核是从 __host__ __device__ 构造函数的主机版本中调用的。令人惊讶的是，运行此代码时，程序会在内核调用包装器（wrapper）内部以退出码 1 静默退出：

(gdb) make
nvcc -arch=sm_30 test.cu -o test -DAVX_VECTOR_SIZE=32
(gdb) b exit
Breakpoint 1 at 0x7ffff711b1e0: file exit.c, line 104.
(gdb) r
Breakpoint 1, __GI_exit (status=1) at exit.c:104
104 exit.c: No such file or directory.
(gdb) f 3
#3  0x0000000000402c36 in VectorDevice<int>::VectorDevice(int) ()
(gdb) f 2
#2  0x0000000000402cb0 in void deviceSizeOf<int>(unsigned long*) ()
(gdb) f 1
#1  0x0000000000402ad2 in void __wrapper__device_stub_deviceSizeOf<int>(unsigned long*&) ()
(gdb) disass
Dump of assembler code for function _Z35__wrapper__device_stub_deviceSizeOfIiEvRPm:
   0x0000000000402abc <+0>: push   %rbp
   0x0000000000402abd <+1>: mov    %rsp,%rbp
   0x0000000000402ac0 <+4>: sub    $0x10,%rsp
   0x0000000000402ac4 <+8>: mov    %rdi,-0x8(%rbp)
   0x0000000000402ac8 <+12>:    mov    $0x1,%edi
   0x0000000000402acd <+17>:    callq  0x402270 <exit@plt>
End of assembler dump.

进一步调查表明，内核代码没有出现在 cubin 中，并且这种行为在某种程度上与 __CUDA_ARCH__ 有关。

所以，2个问题：

1）为什么会这样？

2）如何结合主机端内核调用使用__CUDA_ARCH__条件编译__host__ __device__代码？

谢谢！

更新：CUDA C 编程指南第 E.2.2.1 节的第 2 条中给出了相同的示例。但是，目前仍不清楚处理这个问题的正确方法是什么。

Answer 1

1）为什么会这样？

之所以发生这种情况，是因为您违反了编程指南中指出的那条具体限制：deviceSizeOf<int> 的模板实例化必须在定义了 __CUDA_ARCH__ 时和没有定义它时都发生。而您的代码把内核调用放在 #if !defined(__CUDA_ARCH__) 之内，因此该内核只在未定义 __CUDA_ARCH__ 的主机编译阶段才被实例化。如果使用了这种受限制的写法，则行为未定义。

2）如何结合主机端内核调用使用__CUDA_ARCH__条件编译__host__ __device__代码？

一种可能的方法是：无论 __CUDA_ARCH__ 宏是否被定义，都强制对 <int> 类型的内核函数模板进行实例化。

您可以通过在内核模板定义后立即添加以下行来执行此操作：

// Explicit instantiation of the kernel for T = int, placed right after the
// kernel template definition so that it does not depend on __CUDA_ARCH__.
template __global__ void deviceSizeOf<int>(size_t *);

当我在内核定义之后添加该行，并为 AVX_VECTOR_SIZE 提供合适的定义（据我所知，您的示例中似乎没有定义它）之后，您的代码在我这里可以编译并正确运行。

Answer 2

我发现不使用显式模板实例化也可以解决这个问题（注意：紧随其后的代码片段是 pandas 的 Python 示例，与本问题无关；该答案原本的 CUDA 代码似乎在整理过程中丢失了）：

>>> d = {'census' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),'county' : pd.Series(['z','x','w','y'], index=['a', 'b', 'c', 'd'])}
>>> df = pd.DataFrame(d)
>>> df
   census  county
a  1.0      z
b  2.0      x
c  3.0      w
d  NaN      y
>>> df.ix[df['census'].argmax()]['county']
'w'

__CUDA_ARCH__和__host__ __device__函数中的内核调用

2 个答案:

__CUDA_ARCH__和__host__ __device__函数中的内核调用

2 个答案:

__CUDA_ARCH__和__host__ __device__函数中的内核调用