Allocating a 1-D array with cudaMallocPitch and copying it to the device with cudaMemcpy2D

Date: 2015-05-30 15:31:42

Tags: c++ matrix cuda gpu sparse-matrix

I have read the post Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D and many others, including the NVIDIA documentation, but I cannot get cudaMallocPitch and cudaMemcpy2D to work together.

I need to copy a very large matrix stored in array format (Matrix[width * height]), together with a plain array, in order to perform Matrix * vector operations. Using cudaMallocPitch is not optional for me: I need it to avoid conflicts and get better performance.

So I started by trying to copy the matrix (a vector in my case) to the device and to check whether it was copied correctly, but my code does not print anything. With cudaMalloc and cudaMemcpy everything works fine, but I do not know how to handle cudaMallocPitch and cudaMemcpy2D.

What can I do to fix this?

#include <stdio.h>
__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    while (idx < N)
    {
        double e = *(double *)(((char *) A + idx * mpitch) + N);
        printf("(%f)", e);
    }
}
int main()
{
    int N = 1500;
    double * A  = new double[N], * d_A;
    size_t pitch;

    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }
    cudaMallocPitch(&d_A, &pitch,  sizeof(double) * N, 1);
    cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
    cudaFree(d_A);
    delete [] A;
    return 0;
}

1 Answer:

Answer 0 (score: 2)

Error checking goes a long way in debugging; you should have used it before coming here.

It is not clear whether you want a row vector or a column vector, i.e. a [1xN] or an [Nx1] matrix.

I have added an explanation following Talonmies' suggestion, but first here are the working slabs of code.

Here is the [Nx1] case:

#include <cstdio>
#include <iostream>
#include <cuda.h>

using namespace std;

__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx>=N) return;
    double e = *(double *)(((char *) A + idx * mpitch));
    printf("(%f)", e);

}
int main()
{
    int N = 15;
    double * A  = new double[N], * d_A;
    size_t pitch;

    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }

    cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
    if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;

    err = cudaMemcpy2D(d_A, pitch, A, sizeof(double), sizeof(double), N, cudaMemcpyHostToDevice);
    if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;

    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);

    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;

    cudaFree(d_A);
    delete [] A;
    return 0;
}

[1xN]:

#include <cstdio>
#include <iostream>
#include <cuda.h>

using namespace std;

__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx>=N) return;
    int row=0;//only one row

    double *row_ptr = (double *)((char *)A + mpitch * row); // cast to char* before adding the byte pitch
    double e = row_ptr[idx];
    printf("(%f)", e);

}
int main()
{
    int N = 15;
    double * A  = new double[N], * d_A;
    size_t pitch;

    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }

    cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
    if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;

    err = cudaMemcpy2D(d_A, pitch, A, sizeof(double)*N, sizeof(double)*N, 1, cudaMemcpyHostToDevice);
    if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;

    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);

    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;

    cudaFree(d_A);
    delete [] A;
    return 0;
}

Explanation:

Firstly, error handling:

Considering how easy error handling is in CUDA, there is no good reason not to put it in.

cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
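
If you prefer not to repeat the if-check after every call, the same idea can be wrapped in a macro. The sketch below is my own illustration (the name CUDA_CHECK is not part of the CUDA API or of the answer above), shown only to make the pattern reusable:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage:
//   CUDA_CHECK(cudaMallocPitch(&d_A, &pitch, sizeof(double), N));
//   kernel<<<nblocks, blocksize>>>(pitch, d_A, N);
//   CUDA_CHECK(cudaGetLastError());        // catches launch errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches execution errors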

Secondly, you did not specify whether you want a column vector or a row vector. Since a row vector is simply a 1-D array in linear memory and you do not need pitched memory for that, I will assume you mean a column vector.

The recurring problem you are hitting is a "misaligned address" error in the kernel. That points to a bookkeeping problem, so let's walk through the three main steps of handling an aligned 2D array (even though our arrays will only be column or row vectors).

Allocating:
Your allocation is written out as

cudaMallocPitch(&d_A, &pitch,  sizeof(double) * N, 1);

This is correct for a row vector, since the API is cudaMallocPitch(void** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows). But if we want a column vector, the correct call is

cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
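
To see the practical difference between the two calls, a small sketch like the one below (my own illustration, not part of the answer) prints the pitch each allocation returns; the exact values depend on the device, but the column-vector pitch is typically padded well beyond sizeof(double):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const int N = 1500;
    double *d_row = nullptr, *d_col = nullptr;
    size_t pitch_row = 0, pitch_col = 0;

    // Row vector: one row of N doubles.
    cudaMallocPitch((void **)&d_row, &pitch_row, sizeof(double) * N, 1);

    // Column vector: N rows of one double each; every element starts a padded row.
    cudaMallocPitch((void **)&d_col, &pitch_col, sizeof(double), N);

    printf("row-vector pitch: %zu bytes, column-vector pitch: %zu bytes\n",
           pitch_row, pitch_col);

    cudaFree(d_row);
    cudaFree(d_col);
    return 0;
}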

Accessing:
For the access, you have mixed up accessing a row with accessing an element within the row.

double e = *(double *)(((char *) A + idx * mpitch) + N);

Again, just stick to the documentation. The API documentation for cudaMallocPitch includes

T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;

which for us translates into

int column = 0;
double element = *((double*)((char*)A + idx * mpitch) + column);

where I have used column = 0 for completeness, since we do not have more than one column.
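
Since the question is ultimately about a Matrix * vector product, the same access formula carries over unchanged to a full pitched [height x width] matrix. The kernel below is only a sketch of that idea (the name matvec_pitched and all parameter names are mine, and it assumes the matrix was allocated with cudaMallocPitch(..., width * sizeof(double), height)):

__global__ void matvec_pitched(const double *M, size_t mpitch,
                               const double *x, double *y,
                               int width, int height)
{
    int row = threadIdx.x + blockIdx.x * blockDim.x;
    if (row >= height) return;

    // Row start: BaseAddress + Row * pitch, exactly as in the documentation.
    const double *row_ptr = (const double *)((const char *)M + row * mpitch);

    double sum = 0.0;
    for (int col = 0; col < width; ++col)
        sum += row_ptr[col] * x[col];   // element = row_ptr[Column]

    y[row] = sum;
}

// Launched as, for example:
//   matvec_pitched<<<(height + 255) / 256, 256>>>(d_M, pitch, d_x, d_y, width, height);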

Copying:

cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);

This is correct for this case. The API for cudaMemcpy2D is

cudaMemcpy2D(void* destination, size_t pitch_from_mallocPitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer);
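
For completeness, copying a full row-major [height x width] host matrix into a pitched device allocation looks like the sketch below (my own example; the names h_M, d_M, width and height are not from the question):

#include <vector>
#include <cuda_runtime.h>

int main()
{
    const int width = 1500, height = 1500;
    std::vector<double> h_M(static_cast<size_t>(width) * height, 1.0);

    double *d_M = nullptr;
    size_t d_pitch = 0;
    cudaMallocPitch((void **)&d_M, &d_pitch, width * sizeof(double), height);

    // Host rows are tightly packed, so the source pitch equals the row width in bytes.
    cudaMemcpy2D(d_M, d_pitch,
                 h_M.data(), width * sizeof(double),   // source and its pitch
                 width * sizeof(double),               // bytes copied per row
                 height,                               // number of rows
                 cudaMemcpyHostToDevice);

    cudaFree(d_M);
    return 0;
}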