我试图编写矩阵转置算法。我测试这个程序的矩阵大小等于1024,结果表明并非所有元素都在正确的位置。
为什么我的数组没有正确转置?有没有人可以帮助我或给我任何暗示?我会很感激的。非常感谢!
有整个cpu代码:
__global__ void transpose_naive (float *out, float *in, int w, int h )
{
unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;
if ( xIdx <=w && yIdx <=h ) {
unsigned int idx_in = xIdx + w * yIdx;
unsigned int idx_out = yIdx + h * xIdx;
out[idx_out] = in[idx_in];
}
}
int main()
{
int nx=1024;
int mem_size = nx*nx*sizeof(float);
int t=32;
dim3 dimGrid(((nx-1)/t) +1, ((nx-1)/t) +1);
dim3 dimBlock(t,t);
float *h_idata = (float*)malloc(mem_size);
float *h_cdata = (float*)malloc(mem_size);
float *d_idata, *d_cdata;
checkCuda(cudaMalloc(&d_idata, mem_size) );
checkCuda(cudaMalloc(&d_cdata, mem_size) );
// host
for (int j = 0; j < nx; j++)
for (int i = 0; i < nx; i++)
h_idata[j*nx + i] = j*nx + i;
// device
checkCuda(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );
// events for timing
cudaEvent_t startEvent, stopEvent;
checkCuda(cudaEventCreate(&startEvent) );
checkCuda(cudaEventCreate(&stopEvent) );
float ms;
checkCuda( cudaEventRecord(startEvent, 0) );
transpose_naive<<<dimGrid, dimBlock>>>(d_cdata, d_idata,nx,nx);
checkCuda(cudaEventRecord(stopEvent, 0) );
checkCuda(cudaEventSynchronize(stopEvent) );
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent) );
checkCuda( cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost) );
printf("the time %5f ", ms);
printf("\n");
savetofile(h_idata,"i.txt",nx,nx);
savetofile(h_cdata,"t.txt",nx,nx);
error_exit:
// cleanup
checkCuda(cudaEventDestroy(startEvent) );
checkCuda(cudaEventDestroy(stopEvent) );
checkCuda( cudaFree(d_cdata) );
checkCuda( cudaFree(d_idata) );
free(h_idata);
free(h_cdata);
system("pause");
}
答案 0 :(得分:2)
我认为文件输出“i.txt”和“t.txt”有问题,否则程序看起来是正确的。我通过在标准输出流上添加错误检查和打印,对代码进行了一些小的更改。我正在打印最后一个(1020 - 1024)3 x 3矩阵以交叉检查转置。在您的系统上运行它并验证矩阵转置是否正确?
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include "device_launch_parameters.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file, line);
if (abort) exit(code);
}
}
__global__ void transpose_naive(float *out, float *in, int w, int h)
{
unsigned int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIdx = blockDim.y * blockIdx.y + threadIdx.y;
if (xIdx <= w && yIdx <= h) {
unsigned int idx_in = xIdx + w * yIdx;
unsigned int idx_out = yIdx + h * xIdx;
out[idx_out] = in[idx_in];
}
}
int main()
{
int nx = 1024;
int mem_size = nx*nx*sizeof(float);
int t = 32;
dim3 dimGrid(((nx - 1) / t) + 1, (((nx - 1) / t) + 1));
dim3 dimBlock(t, t);
float *h_idata = (float*)malloc(mem_size);
float *h_cdata = (float*)malloc(mem_size);
float *d_idata, *d_cdata;
gpuErrchk(cudaMalloc(&d_idata, mem_size));
gpuErrchk(cudaMalloc(&d_cdata, mem_size));
// host
for (int j = 0; j < nx; j++)
for (int i = 0; i < nx; i++)
h_idata[j*nx + i] = j*nx + i;
// device
gpuErrchk(cudaMemcpy(d_idata,h_idata,mem_size,cudaMemcpyHostToDevice));
// events for timing
cudaEvent_t startEvent, stopEvent;
gpuErrchk(cudaEventCreate(&startEvent));
gpuErrchk(cudaEventCreate(&stopEvent));
float ms;
gpuErrchk(cudaEventRecord(startEvent, 0));
transpose_naive << <dimGrid, dimBlock >> >(d_cdata, d_idata, nx, nx);
gpuErrchk(cudaEventRecord(stopEvent, 0));
gpuErrchk(cudaEventSynchronize(stopEvent));
gpuErrchk(cudaEventElapsedTime(&ms, startEvent, stopEvent));
gpuErrchk(cudaMemcpy(h_cdata,d_cdata,mem_size,cudaMemcpyDeviceToHost));
printf("the time %5f ", ms);
printf("\n");
for (int i = 1020; i < 1024; i++) {
for (int j = 1020; j < 1024; j++) {
printf("%.2f ", h_idata[i*nx + j]);
}
printf("\n");
}
printf("\n");
for (int i = 1020; i < 1024; i++) {
for (int j = 1020; j < 1024; j++) {
printf("%.2f ", h_cdata[i*nx + j]);
}
printf("\n");
}
//savetofile(h_idata, "i.txt", nx, nx);
//savetofile(h_cdata, "t.txt", nx, nx);
//error_exit:
// cleanup
gpuErrchk(cudaEventDestroy(startEvent));
gpuErrchk(cudaEventDestroy(stopEvent));
gpuErrchk(cudaFree(d_cdata));
gpuErrchk(cudaFree(d_idata));
free(h_idata);
free(h_cdata);
//system("pause");
}
答案 1 :(得分:1)
代码中唯一的缺陷是内核的以下行中的错误绑定检查。
if ( xIdx <=w && yIdx <=h ) {
由于x和y维度的索引分别从0
到w-1
和0
到h-1
,因此if
条件应如下所示:
if ( xIdx <w && yIdx <h ) {