Question

下面的代码是合法的C ++（用g ++ -Wall编译清理）：

#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif

#include <stdio.h>

template <class T>
struct Array
{
int width, height;
T *ptr;
};

#ifdef HAVE_CUDA
template<typename T, int KernelSize>
     static __global__ void genConvolve_kernel(const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth )
{
    if ((threadIdx.x == 4) && (threadIdx.y == 2))
       printf("Hello world from CUDA!\n");
}
#endif

#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
     void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
{
#ifdef HAVE_CUDA
    dim3 block(blockWidth,blockHeight);
    dim3 grid(1,1);
    genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
    printf("Hello, world!\n");
#endif
}

template <typename T, int KernelSize>
     void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
{
    genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}

int main(int argc, char *argv[])
{
    Array<float> a;

    genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
    cudaDeviceSynchronize();
#endif

    return 0;
}

然而，当我尝试用nvcc编译它时，我得到错误：

nvcc t.cu

t.cu（39）：警告：在重新声明时指定默认参数   未引用的函数模板是非标准的

t.cu（39）：警告：重新定义默认参数

t.cu（51）：警告：在重新声明时指定默认参数   未引用的函数模板是非标准的

t.cu（51）：警告：重新定义默认参数

t.cu（53）：错误：模板实例化导致意外   函数类型“void（const float *，float *，int，int，int，int，   int）“（自模板以来名称的含义可能已更改）   声明 - 模板的类型是“void（const __restrict__ T   *，__ strict __ T *，int，int，int，int，int）“）             检测期间：               基于模板参数（53）实例化“genConvolve_cuda_deviceptrs”：这里               实例化“void genConvolve_cuda（const Array＆amp;，const Array＆amp ;,,,   Array＆amp;，int，int）[with T = float，KernelSize = 3]“（60）：here

（在发布之前清理示例时，行号略有偏移。）

当我定义-DMAKE_COMPILE时，警告和错误消失了;但是，我真的想在头文件中指定前向声明，并使用restrict！

所以有两个问题：

如果有默认函数参数（在我的情况下是blockWidth和blockHeight？），如何使用NVCC指定模板函数的前向声明
如何正确使用__restrict__与模板参数？

Answer 1

如何正确使用__restrict__模板参数？

在与同事商量后，有人指出这__restrict__用法：

const T __restrict__ * inputImageArray ...

值得怀疑。为了使__restrict__产生任何效果，应该将它放在星号和指针名称之间：

const T * __restrict__ inputImageArray ...

（gcc reference和CUDA reference）

在您所显示的非标准用法中，gcc似乎允许这样但是默默地“放弃”意图;在这种情况下，__restrict__的效果不适用。在这方面，CUDA确实与gcc行为不同。但是，由于上述问题的使用有问题，因此nvcc不太可能被修改为“修复”此问题。

如果切换到标准__restrict__用法，则可以使编译错误在您显示的代码中消失。无论如何，如果您的意图是向编译器声明这些实际上是受限制的指针，那么建议这样做：

#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif

#include <stdio.h>

template <class T>
struct Array
{
int width, height;
T *ptr;
};

#ifdef HAVE_CUDA
template<typename T, int KernelSize>
     static __global__ void genConvolve_kernel(const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth )
{
    if ((threadIdx.x == 4) && (threadIdx.y == 2))
       printf("Hello world from CUDA!\n");
}
#endif

#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
     void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
{
#ifdef HAVE_CUDA
    dim3 block(blockWidth,blockHeight);
    dim3 grid(1,1);
    genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
    printf("Hello, world!\n");
#endif
}

template <typename T, int KernelSize>
     void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
{
    genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}

int main(int argc, char *argv[])
{
    Array<float> a;

    genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
    cudaDeviceSynchronize();
#endif

    return 0;
}

警告仍然存在;这似乎是一个单独的问题：

t986.cu（33）：警告：在重新声明未引用的函数模板时指定默认参数是非标准的

t986.cu（33）：警告：重新定义默认参数

t986.cu（45）：警告：在重新声明未引用的函数模板时指定默认参数是非标准的

t986.cu（45）：警告：重新定义默认参数

如果默认（模板）函数参数包含在第一个声明中而不是后续声明中，则可以使这些警告消失，如下所示：

#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif

#include <stdio.h>

template <class T>
struct Array
{
int width, height;
T *ptr;
};

#ifdef HAVE_CUDA
template<typename T, int KernelSize>
     static __global__ void genConvolve_kernel(const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth )
{
    if ((threadIdx.x == 4) && (threadIdx.y == 2))
       printf("Hello world from CUDA!\n");
}
#endif

#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
     void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth, int blockHeight)
{
#ifdef HAVE_CUDA
    dim3 block(blockWidth,blockHeight);
    dim3 grid(1,1);
    genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
    printf("Hello, world!\n");
#endif
}

template <typename T, int KernelSize>
     void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth, int blockHeight)
{
    genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}

int main(int argc, char *argv[])
{
    Array<float> a;

    genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
    cudaDeviceSynchronize();
#endif

    return 0;
}

虽然我同意仍然不同于g ++行为。然而，这里的gnu工具可能仍然是不寻常的情况。重新定义默认参数仍然是意料之外的，clang和cl.exe（微软）都会遇到问题。

nvcc处理restrict和模板函数的默认参数

1 个答案:

nvcc处理__restrict__和模板函数的默认参数

1 个答案:

nvcc处理restrict和模板函数的默认参数