Question

我正在尝试编写矩阵转置算法。我用大小为 1024 × 1024 的矩阵测试了这个程序，结果显示并非所有元素都位于正确的位置。

为什么我的数组没有被正确转置？有没有人可以帮助我，或者给我一些提示？我会很感激的。非常感谢！

以下是完整的代码（CUDA 内核和主机端代码）：

// Naive transpose kernel: each thread copies one element of `in` to its
// transposed position in `out`.
//
//   out  destination buffer; written at index yIdx + h * xIdx
//   in   source buffer; read at index xIdx + w * yIdx
//   w    row stride used to index `in` (bound applied to xIdx)
//   h    row stride used to index `out` (bound applied to yIdx)
//
// The thread's (xIdx, yIdx) comes from the 2D block/thread indices, so the
// kernel is meant to be launched with a 2D grid of 2D blocks.
__global__ void transpose_naive (float *out, float *in, int w, int h )
{
    // Global x / y coordinate of this thread within the whole grid.
    unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;
    // NOTE(review): valid coordinates are 0..w-1 and 0..h-1, so `<=` also
    // admits threads with xIdx == w or yIdx == h; the guard should
    // presumably use `<` instead.
    if ( xIdx <=w && yIdx <=h ) {
        // Linear offset of element (xIdx, yIdx) in the source.
        unsigned int idx_in = xIdx + w * yIdx;
        // Linear offset of the swapped coordinate (yIdx, xIdx) in the destination.
        unsigned int idx_out = yIdx + h * xIdx;
        out[idx_out] = in[idx_in];
    }
}

// Host driver: fills an nx x nx matrix, transposes it with transpose_naive,
// times the kernel with CUDA events and hands both matrices to savetofile.
// NOTE(review): checkCuda and savetofile are not defined in this excerpt;
// checkCuda presumably validates the returned cudaError_t and savetofile
// presumably writes a matrix to the named text file - confirm.
int main()

{
    int nx=1024;
    // Size in bytes of one nx x nx float matrix.
    int mem_size = nx*nx*sizeof(float);
    int t=32;
    // ceil(nx / t) blocks in each dimension, t x t threads per block.
    dim3 dimGrid(((nx-1)/t) +1, ((nx-1)/t) +1);
    dim3 dimBlock(t,t);

    // NOTE(review): the malloc results are used without a NULL check.
    float *h_idata = (float*)malloc(mem_size);
    float *h_cdata = (float*)malloc(mem_size);
    float *d_idata, *d_cdata;
    checkCuda(cudaMalloc(&d_idata, mem_size) );
    checkCuda(cudaMalloc(&d_cdata, mem_size) );   
    // Fill the host input so that every element holds its own linear index.
    for (int j = 0; j < nx; j++)
        for (int i = 0; i < nx; i++)
            h_idata[j*nx + i] = j*nx + i;

    // Copy the input matrix to the device.
    checkCuda(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );

    // Events bracketing the kernel launch, used to measure its duration.
    cudaEvent_t startEvent, stopEvent;
    checkCuda(cudaEventCreate(&startEvent) );
    checkCuda(cudaEventCreate(&stopEvent) );
    float ms;
    checkCuda( cudaEventRecord(startEvent, 0) );
    // Square case: both w and h are nx.
    // NOTE(review): no cudaGetLastError() follows the launch here.
    transpose_naive<<<dimGrid, dimBlock>>>(d_cdata, d_idata,nx,nx);
    checkCuda(cudaEventRecord(stopEvent, 0) );
    // Wait for the stop event before reading the elapsed time.
    checkCuda(cudaEventSynchronize(stopEvent) );
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent) );
    // Copy the transposed matrix back to the host.
    checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );

    printf("the time %5f ", ms);
    printf("\n");
    // Hand the input and the result to savetofile under the names i.txt / t.txt.
    savetofile(h_idata,"i.txt",nx,nx);
    savetofile(h_cdata,"t.txt",nx,nx);

// NOTE(review): no goto targeting this label is visible in this function.
error_exit:
    // Release events, device buffers and host buffers.
    checkCuda(cudaEventDestroy(startEvent) );
    checkCuda(cudaEventDestroy(stopEvent) );
    checkCuda( cudaFree(d_cdata) );
    checkCuda( cudaFree(d_idata) );
    free(h_idata);
    free(h_cdata);
    // Runs the shell command "pause" (presumably to keep a Windows console open).
    system("pause"); 
}

Answer 1

我认为问题出在写出 "i.txt" 和 "t.txt" 文件的环节，除此之外程序看起来是正确的。我对代码做了一些小改动：添加了错误检查，并把结果打印到标准输出。我打印了矩阵右下角的 4 x 4 子矩阵（行、列索引 1020 到 1023），用来交叉检查转置结果。请在您的系统上运行它，验证矩阵转置是否正确。

#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include "device_launch_parameters.h"

// Wraps a CUDA runtime call and reports a failure together with its call site.
// Written as do { ... } while (0) so the macro expands to exactly one
// statement and stays safe inside an unbraced if / else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Checks the status returned by a CUDA runtime call.
//
//   code   status value returned by the call
//   file   source file of the call site (supplied by gpuErrchk)
//   line   source line of the call site (supplied by gpuErrchk)
//   abort  when true (the default) the process exits with `code` as its
//          exit status after the message has been printed
//
// Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}

// Naive out-of-place matrix transpose: one thread copies one element.
//
//   out  destination, indexed as out[y + h * x]  (w rows of h elements)
//   in   source,      indexed as in[x + w * y]   (h rows of w elements)
//   w    number of columns of `in`
//   h    number of rows of `in`
//
// Launch with a 2D grid of 2D blocks that covers at least w x h threads;
// threads that fall outside the matrix do nothing.
__global__ void transpose_naive(float *out, float *in, int w, int h)
{
    // Nothing to copy for an empty or invalid shape; this also keeps the
    // unsigned comparisons below well defined.
    if (w <= 0 || h <= 0)
        return;

    const unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;

    // Valid coordinates are 0..w-1 and 0..h-1, so the comparison must be
    // strict: `<=` would let threads at x == w or y == h read and write
    // outside the matrix.
    if (xIdx < (unsigned int)w && yIdx < (unsigned int)h) {
        // size_t keeps the linear offsets from wrapping for large matrices.
        const size_t idx_in  = (size_t)xIdx + (size_t)w * yIdx;
        const size_t idx_out = (size_t)yIdx + (size_t)h * xIdx;
        out[idx_out] = in[idx_in];
    }
}

// Prints the square sub-block [first, last) x [first, last) of the row-major
// nx-wide matrix `m`, one matrix row per output line.
static void printBlock(const float *m, int nx, int first, int last)
{
    for (int i = first; i < last; i++) {
        for (int j = first; j < last; j++) {
            printf("%.2f ", m[i * nx + j]);
        }
        printf("\n");
    }
}

// Host driver: fills an nx x nx matrix with its own linear indices,
// transposes it on the device, reports the kernel time and prints the
// bottom-right 4 x 4 corner of the input and of the result so the transpose
// can be checked by eye.
int main()
{
    const int nx = 1024;
    const int t = 32;
    // Byte count computed in size_t so larger nx cannot overflow an int.
    const size_t mem_size = (size_t)nx * nx * sizeof(float);
    // ceil(nx / t) blocks per dimension, t x t threads per block.
    const dim3 dimGrid(((nx - 1) / t) + 1, ((nx - 1) / t) + 1);
    const dim3 dimBlock(t, t);

    float *h_idata = (float*)malloc(mem_size);
    float *h_cdata = (float*)malloc(mem_size);
    if (h_idata == NULL || h_cdata == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)mem_size);
        free(h_idata);
        free(h_cdata);
        return EXIT_FAILURE;
    }

    float *d_idata, *d_cdata;
    gpuErrchk(cudaMalloc(&d_idata, mem_size));
    gpuErrchk(cudaMalloc(&d_cdata, mem_size));

    // host: every element holds its own linear index
    for (int j = 0; j < nx; j++)
        for (int i = 0; i < nx; i++)
            h_idata[j * nx + i] = (float)(j * nx + i);

    // device
    gpuErrchk(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // events for timing
    cudaEvent_t startEvent, stopEvent;
    gpuErrchk(cudaEventCreate(&startEvent));
    gpuErrchk(cudaEventCreate(&stopEvent));
    float ms;
    gpuErrchk(cudaEventRecord(startEvent, 0));
    transpose_naive<<<dimGrid, dimBlock>>>(d_cdata, d_idata, nx, nx);
    // A kernel launch returns no status itself; a bad launch configuration
    // is only reported through cudaGetLastError().
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stopEvent, 0));
    gpuErrchk(cudaEventSynchronize(stopEvent));
    gpuErrchk(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    gpuErrchk(cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost));

    printf("the time %5f ", ms);
    printf("\n");

    // Bottom-right 4 x 4 corner of the input, then of the transposed result.
    printBlock(h_idata, nx, nx - 4, nx);
    printf("\n");
    printBlock(h_cdata, nx, nx - 4, nx);

    //savetofile(h_idata, "i.txt", nx, nx);
    //savetofile(h_cdata, "t.txt", nx, nx);

    // cleanup
    gpuErrchk(cudaEventDestroy(startEvent));
    gpuErrchk(cudaEventDestroy(stopEvent));
    gpuErrchk(cudaFree(d_cdata));
    gpuErrchk(cudaFree(d_idata));
    free(h_idata);
    free(h_cdata);
    //system("pause");
    return 0;
}

Answer 2

代码中唯一的缺陷是内核中下面这一行的边界检查写错了。

if ( xIdx <=w && yIdx <=h ) {

由于x和y维度的索引分别从0到w-1和0到h-1，因此if条件应如下所示：

if ( xIdx <w && yIdx <h ) {

方阵使用CUDA进行转置

2 个答案: