从设备到主机的cudaMemcpy错误

时间:2016-10-09 23:49:45

标签: cuda

我在内核上计算后从设备到主机返回一个二维结构。

HANDLE_ERROR(cudaMemcpy(Pixel,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));

Pixel在主机上声明,Pixel_gpu在设备上分配,如下所示:

**Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));

pixel **Pixel = (pixel**)malloc((img_ht)*sizeof(pixel*));
for(int i=0;i<(img_ht);i++)
    Pixel[i]=(pixel*)malloc((img_wd)*sizeof(pixel));

使用这个我最终得到非法的内存访问错误。

为结果尝试类似的内存对齐,也没有帮助。

 pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));



HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));

内核启动:

cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));


int thread_block=sqrt(prop.maxThreadsPerBlock);
dim3 DimGrid(ceil(img_wd/thread_block),ceil(img_ht/thread_block),1);
dim3 DimBlock(sqrt(prop.maxThreadsPerBlock),sqrt(prop.maxThreadsPerBlock),1);

//allocating gpu memory


pixel **Pixel_tmp_gpu, **Pixel_gpu;


HANDLE_ERROR(cudaMalloc(&Pixel_tmp_gpu,img_wd*img_ht*sizeof(pixel)));
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));


float **kernel0_gpu, **kernel1_gpu;

HANDLE_ERROR(cudaMalloc(&kernel0_gpu,k*1*sizeof(float)));
HANDLE_ERROR(cudaMalloc(&kernel1_gpu,1*k*sizeof(float)));

cout<<"memory allocated"<<endl;

//copying needed data

HANDLE_ERROR(cudaMemcpy(Pixel_tmp_gpu,Pixel_tmp,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(Pixel_gpu,Pixel,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel0_gpu,kernel0,k*1*sizeof(float),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel1_gpu,kernel1,1*k*sizeof(float),cudaMemcpyHostToDevice));

cout<<"memory transfers done"<<endl;

vertical_conv<<<DimGrid,DimBlock>>>(Pixel_gpu, Pixel_tmp_gpu,img_wd, img_ht,kernel0_gpu,k);
time_t vertical_convolution=time(NULL);

cout<<" vertical_convolution time: "<<double(vertical_convolution - reading_file)<<"sec"<<endl;


horizontal_conv<<<DimGrid,DimBlock>>>(Pixel_tmp_gpu, Pixel_gpu, img_wd, img_ht, kernel1_gpu, k);
time_t horizontal_convolution=time(NULL);

cout<<" horizontal convolution time:" <<double(horizontal_convolution-vertical_convolution)<<" sec"<<endl;

pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));

HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));

使用的功能:

struct  pixel //to store RGB values
{
    unsigned char r;
    unsigned char g;
    unsigned char b;
};

static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        cout<<cudaGetErrorString(err)<<" in "<< file <<" at line "<< line<<endl;
    }
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

__device__ void padding(pixel** Pixel_val, int x_coord, int y_coord, int img_width, int img_height, pixel Px) //padding the image,depending on pixel coordinates, can be replaced by reflect for better result //currently zero padding
{
    if(x_coord<img_width && y_coord<img_height && x_coord>=0 && y_coord>=0) 
        Px=Pixel_val[y_coord][x_coord];
}

垂直卷积:

__global__ void vertical_conv(pixel** Pixel_in, pixel** Pixel_out,int img_wd, int img_ht, float** kernel, int k)
{
    float tmp_r, tmp_g, tmp_b;
    pixel pix_val;
    pix_val.r=0;pix_val.g=0;pix_val.b=0;
    int row=blockIdx.y*blockDim.y + threadIdx.y;

    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if(row<img_ht && col<img_wd){
        tmp_r=0, tmp_g=0, tmp_b=0;
        for(int l=0;l<k;l++)
        {   

            padding(Pixel_in, col, row+l-(k-1)/2, img_wd, img_ht, pix_val);
            tmp_r+=pix_val.r * kernel[l][0];
            tmp_b+=pix_val.b * kernel[l][0];
            tmp_g+=pix_val.g * kernel[l][0];
        }

        Pixel_out[row][col].r=tmp_r;
        Pixel_out[row][col].g=tmp_g;
        Pixel_out[row][col].b=tmp_b;
    }
}

水平卷积:

   __global__ void horizontal_conv(pixel** Pixel_in, pixel** Pixel_out, int img_wd, int img_ht, float** kernel, int k)
{
    float tmp_r, tmp_b, tmp_g;
    pixel pix_val;
    pix_val.r=0;pix_val.g=0;pix_val.b=0;

    //horizontal convolution
    int row=blockIdx.y*blockDim.y + threadIdx.y;

    int col = blockIdx.x*blockDim.x + threadIdx.x;
    tmp_r=0, tmp_g=0, tmp_b=0;
    if(row<img_ht && col<img_wd)
    {
        for(int l=0; l<k;l++)
        {
            padding(Pixel_in, col+l-(k-1)/2, row, img_wd, img_ht, pix_val);
            tmp_r+=pix_val.r * kernel[0][l];
            tmp_g+=pix_val.g * kernel[0][l];
            tmp_b+=pix_val.b * kernel[0][l];
        }
        Pixel_out[row][col].r=tmp_r;
        Pixel_out[row][col].g=tmp_g;
        Pixel_out[row][col].b=tmp_b;
    }
}

有人能帮助我知道这里可能出现什么问题吗?

1 个答案:

答案 0 :(得分:2)

Pixel_gpu一个连续内存块,由w*hpixel个元素组成。它的大小是

sizeOfDeviceMemory = img_wd * img_ht * sizeof(pixel)

与此相反,CPU端的Pixel是一个“指针数组”:Pixel指针指向h类型的pixel*元素。它的大小是

sizeOfHostMemory = img_ht * sizeof(pixel*)

显然,这些大小不同,尝试将sizeOfDeviceMemory个字节写入此指针会导致非法访问。

通常,您应该将主机上的内存分配为一个连续的块:

pixel* Pixel = (pixel*)malloc(img_wd * img_ht * sizeof(pixel));

然后,您可以使用已有的cudaMemcpy调用将内存复制到此指针。

如果主机上有pixel*对你不好,而你迫切需要一个pixel**(例如,将其传递给其他函数),那么你可以创建一个“数组指针“就像你以前一样,但为每一行分配新内存,而是让每个指针指向单个连续像素块的一个”行“。