Question

我正在尝试使用CUDA的Thrust库修改一个简单的动态数组。但是屏幕上出现了“launch_closure_by_value”错误，提示该错误与某种同步过程有关。

由于此错误，无法进行简单的1D动态数组修改。

导致错误的我的代码段如下。

从.cpp文件

我调用setIndexedGridInfo，它在System.cu中定义

// Caller side (.cpp file): both buffers are obtained from malloc, i.e. they
// are ordinary host (CPU) allocations.
float* a= (float*)(malloc(8*sizeof(float))); // input buffer: 8 floats
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7;
float* b = (float*)(malloc(8*sizeof(float))); // output buffer: 8 floats, not initialized here
// The raw host pointers are passed straight to setIndexedGridInfo.
setIndexedGridInfo(a,b);

System.cu上的代码段：

// Applies grid_functor to 8 zipped (old, new) element pairs with
// thrust::for_each.
//   a : raw pointer wrapped as the "old data" (input) range
//   b : raw pointer wrapped as the "new data" (output) range
// NOTE(review): thrust::device_ptr only wraps the pointer it is given; this
// function neither allocates device memory nor copies any data, so a and b
// must already point to device memory for the call to be valid.
void
setIndexedGridInfo(float* a, float*b)
{

    // Wrap the raw pointers so Thrust treats them as device pointers.
    thrust::device_ptr<float> d_oldData(a);
    thrust::device_ptr<float> d_newData(b);

    // Constructor argument for grid_functor.
    float c = 0.0;

    // The element count (8) is hard-coded in the end iterator.
    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData,d_newData)),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData+8,d_newData+8)),
        grid_functor(c));
}

grid_functor在_kernel.cu中定义

// Functor applied by thrust::for_each to each zipped (old, new) tuple:
// reads tuple element 0, adds 0.1 and stores the sum in tuple element 1.
struct grid_functor
{
    // Value supplied at construction; stored but not read by operator().
    float a;

    __host__ __device__
    grid_functor(float grid_Info) : a(grid_Info) {}

    // t : tuple whose element 0 is the input value and whose element 1 is
    //     the destination that receives the result.
    template <typename Tuple>
    __device__
    void operator()(Tuple t)
    {
        // volatile is kept from the original code; presumably a compiler
        // workaround -- TODO confirm whether it is still needed.
        volatile float data = thrust::get<0>(t);
        // 0.1f (not 0.1): a double literal would promote the addition to
        // double precision inside device code.
        float pos = data + 0.1f;
        thrust::get<1>(t) = pos;
    }

};

我也在Output窗口（我使用Visual Studio）上获取这些：

Particles.exe 中 0x000007fefdc7cacd 处的第一次机会异常：Microsoft C++ 异常：内存位置 0x0029eb60 处的 cudaError_enum。
smokeParticles.exe 中 0x000007fefdc7cacd 处的第一次机会异常：Microsoft C++ 异常：内存位置 0x0029ecf0 处的 thrust::system::system_error。
Particles.exe 中 0x000007fefdc7cacd 处未处理的异常：Microsoft C++ 异常：内存位置 0x0029ecf0 处的 thrust::system::system_error。

导致问题的原因是什么？

Answer 1

您正在尝试在期望设备内存中的指针的函数中使用主机内存指针。这段代码就是问题所在：

// a and b are allocated with malloc, so both are host pointers.
float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7;
float* b = (float*)(malloc(8*sizeof(float)));
// The host pointers are passed on unchanged.
setIndexedGridInfo(a,b);

.....

// Inside setIndexedGridInfo: the pointers received from the caller are wrapped
// as device pointers without any allocation or copy.
thrust::device_ptr<float> d_oldData(a);
thrust::device_ptr<float> d_newData(b);

thrust::device_ptr用于“包装”通过CUDA API分配的设备内存指针，以便Thrust可以使用它。而您是在把主机指针直接当作设备指针来用，这是非法的。您可以像这样修改setIndexedGridInfo函数：

// Runs grid_functor over n (old, new) element pairs on the device and copies
// the results back to the host.
//   a : host pointer to n input floats (uploaded to the device)
//   b : host pointer to n floats that receive the results
//   n : number of elements; the function does nothing when n <= 0
// Needs <thrust/copy.h> in addition to the headers already required for
// device_vector, for_each, zip_iterator and tuple.
void setIndexedGridInfo(float* a, float*b, const int n)
{
    // Guard against forming an invalid [a, a+n) range.
    if (n <= 0)
        return;

    // The range constructor allocates device memory and copies the host data in.
    thrust::device_vector<float> d_oldData(a,a+n);
    // Output storage only: every element is overwritten by the functor, so
    // whatever b currently holds does not need to be uploaded.
    thrust::device_vector<float> d_newData(n);

    // Constructor argument for grid_functor.
    const float c = 0.0f;

    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(),d_newData.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(),d_newData.end())),
        grid_functor(c));

    // The device vectors are destroyed on return; without this copy the
    // caller's b would never receive the computed values.
    thrust::copy(d_newData.begin(), d_newData.end(), b);
}

device_vector构造函数会分配设备内存，然后把主机内存的内容复制到设备上。这应该可以解决您看到的错误，不过我不确定您想用for_each迭代器做什么，也不确定这个仿函数是否正确。

编辑

以下是完整的、可编译、可运行的代码版本：

#include <cstdlib> #include <cstdio> #include <thrust/device_vector.h> #include <thrust/for_each.h> #include <thrust/copy.h> struct grid_functor { float a; __host__ __device__ grid_functor(float grid_Info) : a(grid_Info) {} template <typename Tuple> __device__ void operator()(Tuple t) { volatile float data = thrust::get<0>(t); float pos = data + 0.1f; thrust::get<1>(t) = pos; } }; void setIndexedGridInfo(float* a, float*b, const int n) { thrust::device_vector<float> d_oldData(a,a+n); thrust::device_vector<float> d_newData(b,b+n); float c = 0.0; thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(),d_newData.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(),d_newData.end())), grid_functor(c)); thrust::copy(d_newData.begin(), d_newData.end(), b); } int main(void) { const int n = 8; float* a= (float*)(malloc(n*sizeof(float))); a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7; float* b = (float*)(malloc(n*sizeof(float))); setIndexedGridInfo(a,b,n); for(int i=0; i<n; i++) { fprintf(stdout, "%d (%f,%f)\n", i, a[i], b[i]); } return 0; }

我可以在一台OS X 10.6.8主机上使用CUDA 4.1编译并运行此代码，如下所示：

$ nvcc -Xptxas="-v" -arch=sm_12 -g -G thrustforeach.cu
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space
ptxas info : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEi12grid_functorEEEEvT_' for 'sm_12'
ptxas info : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1]
ptxas info : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEj12grid_functorEEEEvT_' for 'sm_12'
ptxas info : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1]

$ ./a.out
0 (0.000000,0.100000)
1 (1.000000,1.100000)
2 (2.000000,2.100000)
3 (3.000000,3.100000)
4 (4.000000,4.100000)
5 (5.000000,5.100000)
6 (6.000000,6.100000)
7 (7.000000,7.100000)

如何解决CUDA Thrust库 - for_each同步错误？

1 个答案: