Question

我在使用CUDA的.cu文件中有以下代码：

#include "gpu_stgauss2.h"
#include "gpu_st.h"
#include "gpu_sampler.h"

// File-scope texture references for the source image: one for single-channel
// float data and one for float4 data.
static texture<float, 2, cudaReadModeElementType> s_texSRC1;
static texture<float4, 2, cudaReadModeElementType> s_texSRC4;

// Accessors that return a reference to the corresponding texture. Their names
// are passed as template arguments to gpu_sampler in gpu_stgauss2_filter.
// NOTE(review): these are marked __device__ and return the texture by
// reference, which appears to be what the quoted nvcc error ("taking reference
// of texture/surface variable not allowed in __device__/__global__ functions")
// rejects -- confirm against the toolkit version in use.
inline __host__ __device__ texture<float,2>& texSRC1() { return s_texSRC1; }
inline __host__ __device__ texture<float4,2>& texSRC4() { return s_texSRC4; }

// Texture reference and accessor used for the float4 image passed as `st`
// (template argument of gpu_sampler / gpu_resampler in gpu_stgauss2_filter).
static texture<float4, 2, cudaReadModeElementType> s_texST;
inline __host__ __device__ texture<float4,2>& texST() { return s_texST; }

它们稍后在同一文件中使用如下：

// Host-side entry point for the single-channel (float) variant of the filter.
//
// Returns `src` unchanged when sigma <= 0. Otherwise allocates an output image
// with the same size as `src`, creates samplers for `src` and `st`, and
// launches imp_stgauss2_filter instantiated for order 1, 2 or 4. For any other
// `order` no kernel is launched and the freshly allocated image is returned
// as-is.
//
// When `st` has a different size than `src`, `st` is read through a
// gpu_resampler constructed with the per-axis size ratio st/src.
// `max_angle` is converted with radians() and handed to the kernel as its
// cosine. `src_linear` / `st_linear` select cudaFilterModeLinear over
// cudaFilterModePoint for the respective sampler.
gpu_image<float> gpu_stgauss2_filter( const gpu_image<float>& src, const gpu_image<float4>& st,
                                      float sigma, float max_angle, bool adaptive,
                                      bool src_linear, bool st_linear, int order, float step_size,
                                      float precision )
{
    if (sigma <= 0) return src;

    gpu_image<float> dst(src.size());

    const cudaTextureFilterMode src_mode = src_linear ? cudaFilterModeLinear : cudaFilterModePoint;
    const cudaTextureFilterMode st_mode  = st_linear  ? cudaFilterModeLinear : cudaFilterModePoint;

    gpu_sampler<float, texSRC1> src_sampler(src, src_mode);
    const float cos_limit = cosf(radians(max_angle));

    if (src.size() == st.size()) {
        // Same resolution: sample the `st` image directly.
        gpu_sampler<float4, texST> st_sampler(st, st_mode);
        switch (order) {
            case 1:
                imp_stgauss2_filter<1,float><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            case 2:
                imp_stgauss2_filter<2,float><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            case 4:
                imp_stgauss2_filter<4,float><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            default:
                // Unsupported order: nothing is launched.
                break;
        }
    } else {
        // Different resolution: pass the st/src size ratio to the resampler.
        float2 scale = make_float2((float)st.w() / src.w(), (float)st.h() / src.h());
        gpu_resampler<float4, texST> st_sampler(st, scale, st_mode);
        switch (order) {
            case 1:
                imp_stgauss2_filter<1,float><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            case 2:
                imp_stgauss2_filter<2,float><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            case 4:
                imp_stgauss2_filter<4,float><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            default:
                // Unsupported order: nothing is launched.
                break;
        }
    }

    GPU_CHECK_ERROR();
    return dst;
}


// Host-side entry point for the four-channel (float4) variant of the filter.
//
// Returns `src` unchanged when sigma <= 0. Otherwise allocates an output image
// with the same size as `src`, creates samplers for `src` and `st`, and
// launches imp_stgauss2_filter instantiated for order 1, 2 or 4. For any other
// `order` no kernel is launched and the freshly allocated image is returned
// as-is.
//
// When `st` has a different size than `src`, `st` is read through a
// gpu_resampler constructed with the per-axis size ratio st/src.
// `max_angle` is converted with radians() and handed to the kernel as its
// cosine. `src_linear` / `st_linear` select cudaFilterModeLinear over
// cudaFilterModePoint for the respective sampler.
gpu_image<float4> gpu_stgauss2_filter( const gpu_image<float4>& src, const gpu_image<float4>& st,
                                       float sigma, float max_angle, bool adaptive,
                                       bool src_linear, bool st_linear, int order, float step_size,
                                       float precision )
{
    if (sigma <= 0) return src;

    gpu_image<float4> dst(src.size());

    const cudaTextureFilterMode src_mode = src_linear ? cudaFilterModeLinear : cudaFilterModePoint;
    const cudaTextureFilterMode st_mode  = st_linear  ? cudaFilterModeLinear : cudaFilterModePoint;

    gpu_sampler<float4, texSRC4> src_sampler(src, src_mode);
    const float cos_limit = cosf(radians(max_angle));

    if (src.size() == st.size()) {
        // Same resolution: sample the `st` image directly.
        gpu_sampler<float4, texST> st_sampler(st, st_mode);
        switch (order) {
            case 1:
                imp_stgauss2_filter<1,float4><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            case 2:
                imp_stgauss2_filter<2,float4><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            case 4:
                imp_stgauss2_filter<4,float4><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            default:
                // Unsupported order: nothing is launched.
                break;
        }
    } else {
        // Different resolution: pass the st/src size ratio to the resampler.
        float2 scale = make_float2((float)st.w() / src.w(), (float)st.h() / src.h());
        gpu_resampler<float4, texST> st_sampler(st, scale, st_mode);
        switch (order) {
            case 1:
                imp_stgauss2_filter<1,float4><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            case 2:
                imp_stgauss2_filter<2,float4><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            case 4:
                imp_stgauss2_filter<4,float4><<<dst.blocks(), dst.threads()>>>(
                    dst, src_sampler, st_sampler, sigma, cos_limit, adaptive, step_size, precision);
                break;
            default:
                // Unsupported order: nothing is launched.
                break;
        }
    }

    GPU_CHECK_ERROR();
    return dst;
}

但是，它会导致以下错误：

error : taking reference of texture/surface variable not allowed in __device__/__global__ functions

我对CUDA的经验很少。有人可以帮忙解决这个问题吗？感谢。

Answer 1

编译错误已经说明了一切：你不能这样做。我建议直接使用这些纹理变量（而不是通过texSRC1()等函数访问它们），或者让这些函数返回指针而不是引用。

Answer 2

第一，我强烈建议在计算能力（CC）3.0及以上的设备中使用无绑定纹理，因为unbind纹理命令不必同步主机线程。

第二，你应该考虑使用CC 3.0+中引入的新缓存（cache）存储器；为此，只需将存储器指针声明为

 const float* pArray;

第三，如果你坚持使用旧式纹理（它对于插值操作非常有用），请在全局作用域中声明：

   texture     <float, cudaTextureType1D> textureFloat32_1D;

在你的代码中绑定纹理

cudaBindTexture(NULL, textureFloat32_1D, ...);

在内核中使用你想要的纹理......

float a = tex1Dfetch(textureFloat32_1D, location);

内核之外

cudaUnbindTexture(textureFloat32_1D);

请注意，在使用CUDA代码的多线程应用程序中，如果多个线程使用第三点中提到的同一个纹理变量，就会遇到问题（该变量没有受到任何保护！）

Answer 3

尝试将您的CUDA降级到4.0。在CUDA 4.0中可以使用这种代码语法。我曾经遇到类似的问题，CUDA 4.0适合我。

Answer 4

对于遇到同样问题的人：本例中的代码来自此处（here）链接的GPU库。我借鉴了该库其他文件（例如“gpu_stbf2.cu”）中使用的相同策略，成功解决了这个问题，并使用CUDA 6.0和Visual Studio 2012 x64编译通过。

Answer 5

我在尝试编译完全相同的代码时遇到了同样的问题。事实证明，使用ennetws建议的'gpu_stbf2.cu'中的技巧，这里不需要返回引用。

这三个函数实际上只在这个文件中被调用，所以可以把gpu_sampler.h中的struct定义移回到这个文件里；这样就不必通过调用这些函数来获取纹理，而是可以直接使用纹理变量。我把代码放在了GitHub上（here）。

错误：在device / global函数中不允许引用纹理/表面变量

5 个答案:

错误：在__device__/__global__函数中不允许引用纹理/表面变量

5 个答案:

错误：在device / global函数中不允许引用纹理/表面变量