首先,我很抱歉,如果我的语法很糟糕, 我在处理不同维度(160x320)的2D阵列时遇到问题。
dim3 blocks(DIMX/16,DIMZ/32);
dim3 threads(16,16);
这段代码编译得很好,但不知何故只处理了160x160,遗留数组仍然为零。我做错了吗?
#include "cuda.h"
#include "conio.h"
#include <fstream>
#include <sstream>
#include <iostream>
#include <assert.h>
#include "../common/book.h"
#define DIMX 160
#define DIMZ 320
#define PI 3.1415926535897932f
#define dx 1.0
#define dz 1.0
#define dt 0.001
#define samp 500
#define nite 1000
__global__ void txz_kernel(float *txz,float *vz)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
vz[offset]=txz[offset]+vz[offset];
}
int main( void )
{
float *txz;
float *vz;
HANDLE_ERROR( cudaMalloc( (void**)&txz, DIMX * DIMZ * sizeof(float)));
HANDLE_ERROR( cudaMalloc( (void**)&vz, DIMX * DIMZ * sizeof(float)));
float *tempvz = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
float *temptxz = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
for (int i=0; i<DIMX; i++) {
for (int j=0; j<DIMZ; j++) {
int ij=DIMX*j + i;
tempvz[ij]=0.0;
temptxz[ij]=100.0;
}
}
for (int i=0; i<DIMX; i++) {
for (int j=(121); j<DIMZ; j++) {
int ij=DIMX*j + i;
tempvz[ij]=0.0;
temptxz[ij]=150.0;
}
}
HANDLE_ERROR( cudaMemcpy( vz, tempvz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice ) );
HANDLE_ERROR( cudaMemcpy( txz, temptxz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice ) );
dim3 blocks(DIMX/16,DIMZ/32);
dim3 threads(16,16);
txz_kernel<<<blocks,threads>>>(txz,vz) ;
float *tempse = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
HANDLE_ERROR( cudaMemcpy( tempse, vz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyDeviceToHost ) );
std::ofstream outseis("contour.ctxt"); // output, normal file
for (int jj=0; jj<DIMZ; jj++)
{
for (int ii=0; ii<DIMX; ii++)
{
int ij=DIMX*jj + ii;
outseis<<tempse[ij]<<" ";
}
outseis<<"\r\n";
}
}
答案 0 :(得分:1)
此行阻止(DIMX / 16,DIMZ / 32);应该是块(DIMX / 16,DIMZ / 16);如果一切都很好......
同时检查索引应该是
int ij=DIMZ*i + j;
如果您正在连续处理主要订单。如果您正在按照列主要顺序处理您所写的内容是正确的。
以下是ur代码的略微修改版本,它正在编译并给出正确的结果,即添加2个数组并找到总和给我102400(160 * 320 + 160 * 320)
把你自己的号码和检查..
注意:这适用于行主要订单。
#include "cuda.h"
#include <fstream>
#include <sstream>
#include <iostream>
#include <assert.h>
#define DIMX 160
#define DIMZ 320
#define PI 3.1415926535897932f
#define dx 1.0
#define dz 1.0
#define dt 0.001
#define samp 500
#define nite 1000
__global__ void txz_kernel(float *txz,float *vz)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = (x * blockDim.x * gridDim.x) + y ;
if (offset < (DIMX * DIMZ))
{
vz[offset]=txz[offset]+vz[offset];
}
else
{
printf ("Offset going out of the bounds\n") ;
}
}
int main( void )
{
float *txz;
float *vz;
float sum = 0.0 ;
float *tempse ;
HANDLE_ERROR( cudaMalloc( (void**)&txz, DIMX * DIMZ * sizeof(float)));
HANDLE_ERROR( cudaMalloc( (void**)&vz, DIMX * DIMZ * sizeof(float)));
float *tempvz = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
float *temptxz = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
for (int i=0; i<DIMX; i++) {
for (int j=0; j<DIMZ; j++) {
int ij=DIMZ*i + j;
tempvz[ij]=1.0;
temptxz[ij]=1.0;
}
}
cudaMemcpy( txz, temptxz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice ) ;
cudaMemcpy( vz, tempvz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice ) ;
dim3 blocks(DIMX/16,DIMZ/16);
dim3 threads(16,16);
txz_kernel<<<blocks,threads>>>(txz,vz) ;
//cudaDeviceSynchronize() ;
tempse = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
HANDLE_ERROR( cudaMemcpy( tempse, vz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyDeviceToHost ) );
for (int jj=0; jj<DIMX; jj++)
{
for (int ii=0; ii<DIMZ; ii++)
{
int ij=DIMZ*jj + ii;
sum += tempse[ij] ;
}
}
printf ("The sum is %f\n", sum) ;
}