(对不起我的英语)我研究CUDA。我在C CUDA中有关于热2D的代码,但是当我编译代码时会出现错误(不是语法错误)。此错误是"违规段"。我认为这是一个记忆容量,但我不确定,我不知道该怎么做。请帮帮我。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <getopt.h>
#include <cuda.h>
#define MAXSTEP 1000
#define CX 0.001
#define CY 0.001
#define NTHREADS 32
void setupBoundaryConditions (double** X, unsigned long int sizex, unsigned long int sizey);
void initializeArray (double** X, unsigned long int sizex, unsigned long int sizey);
double** make2DDoubleArray (unsigned long int sizex, unsigned long int sizey);
void free2DDoubleArray (double **X, unsigned long int size);
void save (double** X, unsigned long int x, unsigned long int y, char* filename);
double gettime(void);
/*
* subroutine update - CUDA implementation
*/
__global__ void update(unsigned long int NXPROB, unsigned long int NYPROB, double *X1, double *X2)
{
long int i, j;
long int CENTRE, NORD, SUD, EST, OEST;
i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;
CENTRE = i + j*NXPROB; //(i,j) - CENTER
NORD = i + (j+1)*NXPROB; //(i,j+1) - N
SUD = i + (j-1)*NXPROB; //(i,j-1) - S
EST = (i+1) + j*NXPROB; //(i+1,j) - E
OEST = (i-1) + j*NXPROB; //(i-1,j) - W
//ALTERNATIVE0: THIS PART HAS AN ERROR
//ERROR: expression must have pointer-to-object type
// --> it's refering to X2 and X1
/*if(i>0 && i<NXPROB && j>0 && j<NYPROB)
{
X2[i][j] = X1[i][j]
+ CX * ( X1[i+1][j] + X1[i-1][j] - 2.0 * X1[i][j] )
+ CY * ( X1[i][j+1] + X1[i][j-1] - 2.0 * X1[i][j] );
}*/
//ALTERNATIVE 1:
if(i>0 && i<NXPROB && j>0 && j<NYPROB)
{
X2[CENTRE] = X1[CENTRE]
+ CX * (X1[EST] + X1[OEST] - 2.0 * X1[CENTRE])
+ CY * (X1[NORD] + X1[SUD] - 2.0 * X1[CENTRE]);
}
__syncthreads();
}
/*
* Main
*/
int main(int argc, char* argv[]) {
long int s, it;
unsigned int flag, verbose;
unsigned int NX, NY, NXPROB, NYPROB;
double start, end;
int iz;
// Defaut values
NX = 100;
NY = 100;
// create file and verbose flags
flag = 0;
verbose = 0;
// Parse command line options
int opt;
char *file = NULL;
while ((opt = getopt(argc, argv, "hvs:f:")) != -1) {
switch (opt) {
case 'v':
verbose = 1;
break;
case 's':
if( !(s=atoi(optarg)) ) {
fprintf(stderr, "Cannot parse %s value.\n", optarg);
exit(EXIT_FAILURE);
}
NX = NY = s;
break;
case 'f':
file = optarg;
flag = 1;
break;
case 'h':
default:
fprintf(stderr, "Usage: %s [-s SIZE] [-f output file]\n", argv[0]);
exit(EXIT_FAILURE);
}
}
// Set initial data values
NXPROB = NX - 1;
NYPROB = NY - 1;
if(verbose) {
fprintf(stdout, "[INFO] Setting map size to %d (%dx%d)\n", NX*NY, NX, NY);
fprintf(stdout, "[INFO] Max iter %d\n", MAXSTEP);
}
if(verbose && flag) {
fprintf(stdout, "[INFO] Using output file %s\n", file);
}
// Program starts here
start = gettime();
// CPU Memory allocation
double** X[2];
X[0] = make2DDoubleArray (NX, NY);
X[1] = make2DDoubleArray (NX, NY);
// Set initial and boundary conditions
initializeArray (X[0], NX, NY);
setupBoundaryConditions(X[0], NX, NY);
setupBoundaryConditions(X[1], NX, NY);
// GPU Memory allocation
double *d_X1, *d_X2;
cudaMalloc((void **)&d_X1, NX*NY*sizeof(double));
cudaMalloc((void **)&d_X2, NX*NY*sizeof(double));
// Copy CPU --> GPU
cudaMemcpy(d_X1, X, NX*NY*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_X2, X, NX*NY*sizeof(double), cudaMemcpyHostToDevice);
dim3 dimBlock(NTHREADS,NTHREADS);
dim3 dimGrid(1,1);
// Main calculations
iz = 0;
for (it = 0; it < MAXSTEP; it++)
{
if(verbose && (it%(MAXSTEP/10) == 0)) {
fprintf(stdout, "[INFO] iteration %ld, time %.3f seconds\n", it, gettime()-start);
}
// The first update has an error: d_X1 and d_X2
//update<<<dimGrid, dimBlock>>>(NXPROB, NYPROB, d_X1[iz], d_X2[1-iz]);
update<<<dimGrid, dimBlock>>>(NXPROB, NYPROB, d_X1, d_X2);
iz = 1 - iz;
}
// Copy GPU --> CPU
cudaMemcpy(X, d_X1, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(X, d_X2, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);
//cudaThreadSynchronize();
// Save output file
if(flag) save(X[iz], NX, NY, file);
free2DDoubleArray(X[0], NX);
free2DDoubleArray(X[1], NX);
// End time
end = gettime();
// Get information: wall clock time, problem size, ...
if(verbose)
{
fprintf(stdout, "[INFO] Convergence after %d steps\n", MAXSTEP);
fprintf(stdout, "[INFO] Problem size %d [%dx%d]\n", NY*NX, NX, NY);
fprintf(stdout, "[INFO] Wall clock time %lf seconds\n",(end-start));
if(flag) fprintf(stdout, "[INFO] Output file %s\n", file);
}
else
{
printf("Time %.3f seconds, Size %d [%dx%d]\n", end - start, NY*NX, NX, NY);
}
cudaFree(d_X1);
cudaFree(d_X2);
exit(EXIT_SUCCESS);
}
void setupBoundaryConditions(double** X, unsigned long int x, unsigned long int y) {
/* set boundary conditions for ix, jy = 0 and ix, jy = n-1 */
unsigned long int i, j;
double leftBC = 0,
rightBC = 0,
topBC = 0,
bottomBC = 0;
/* setup the bottom and top BCs, jy = 0 and jy = n-1 or arraySizeY - 1 */
for (i = 0; i < x; i++)
{
X[i][0] = bottomBC; //bottom BC
X[i][y-1] = topBC; //top BC
}
/* setup the left and right BCs, ix = 0 and ix = arraySizeX - 1 */
for (j = 0; j < y; j++)
{
X[0][j] = leftBC; //left BC
X[x-1][j] = rightBC; //right BC
}
/* set the values at the corner nodes as averages of both sides*/
// bottom left
X[0][0] = 0.5 * (leftBC + bottomBC);
// top left
X[0][y-1] = 0.5 * (topBC + leftBC);
// top right
X[x-1][y-1] = 0.5 * (topBC + rightBC);
// bottom right
X[x-1][0] = 0.5 * (bottomBC + rightBC);
}
void initializeArray(double** X, unsigned long int x, unsigned long int y) {
unsigned long int i, j;
for (i = 1; i < x; i++)
{
for (j = 1; j < y; j++)
{
X[i][j] = (double)MAXSTEP+(i * (x - i - 1) * j * (y - j - 1));
}
}
for (i = 0; i < x; i++)
{
X[i][0] = 0; //bottom BC
X[i][y-1] = 0; //top BC
}
for (j = 1; j < y; j++)
{
X[0][j] = 0; //left BC
X[x-1][j] = 0; //right BC
}
}
void free2DDoubleArray(double **X, unsigned long int size) {
unsigned long int i;
for (i = 0; i < size; ++i) {
free(X[i]);
}
free(X);
}
double** make2DDoubleArray(unsigned long int x, unsigned long int y) {
unsigned long int ix;
double** X;
X = (double**) malloc(x*sizeof(double*));
for (ix = 0; ix < x; ix++) {
X[ix] =(double*) malloc(y*sizeof(double));
}
return X;
}
void save(double** X, unsigned long int x, unsigned long int y, char* filename) {
unsigned long int i, j;
FILE* file;
file = fopen(filename,"w");
for (i = 0; i < x; i++)
{
for (j = 0; j < y; j++)
{
fprintf(file,"%8.3f ", X[i][j]);
}
fprintf(file,"\n");
}
fclose(file);
}
/* Timing function */
double gettime(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return tv.tv_sec + 1e-6*tv.tv_usec;
}
答案 0 :(得分:-1)
内存容量错误是在设备中创建两个矩阵/阵列。这部分:
// GPU Memory allocation
double *d_X1, *d_X2;
cudaMalloc((void **)&d_X1, NX*NY*sizeof(double));
cudaMalloc((void **)&d_X2, NX*NY*sizeof(double));
解决方案:
// GPU Memory allocation
double *d_X[2];
cudaMalloc((void **)&d_X[0], NX*NY*sizeof(double) );
cudaMalloc((void **)&d_X[1], NX*NY*sizeof(double) );
完整代码:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <getopt.h>
#include <cuda.h>
#define MAXSTEP 1000
#define CX 0.001
#define CY 0.001
#define NTHREADS 32
void setupBoundaryConditions (double** X, unsigned long int sizex, unsigned long int sizey);
void initializeArray (double** X, unsigned long int sizex, unsigned long int sizey);
double** make2DDoubleArray (unsigned long int sizex, unsigned long int sizey);
void save (double** X, unsigned long int x, unsigned long int y, char* filename);
double gettime(void);
/*
* subroutine update - CUDA implementation
*/
__global__ void update(bool shared_memory, unsigned int NX, unsigned long int NXPROB, unsigned long int NYPROB, double *d_X1, double *d_X2)
{
//WITHOUT SHARED MEMORY - CODE THAT USES
if(!shared_memory)
{
long int i;
long int CENTRE, NORD, SUD, EST, OEST;
i = blockIdx.x * blockDim.x + threadIdx.x;
CENTRE = i; //(i,j)
NORD = i + (NXPROB+1); //(i,j+1)
SUD = i + (NXPROB-1); //(i,j-1)
EST = i + 1; //(i+1,j)
OEST = i - 1; //(i-1,j)
if(i>NYPROB && i<NXPROB*NYPROB)
{
d_X2[CENTRE] = d_X1[CENTRE]
+ CX * (d_X1[EST] + d_X1[OEST] - 2.0 * d_X1[CENTRE])
+ CY * (d_X1[NORD] + d_X1[SUD] - 2.0 * d_X1[CENTRE]);
}
}
/*
* If you want execute this part,
* then you put true to "shared_memory" in main
*/
//WITH SHARED MEMORY - SUBOPTIMAL (NOT MORE THAN SECUENCE CODE)
else
{
long int i, j, tx, ty, i2d;
__shared__ double temp[NTHREADS][NTHREADS];
double part_x, part_y;
tx = threadIdx.x;
ty = threadIdx.y;
i = blockIdx.x * blockDim.x + tx;
j = blockIdx.y * blockDim.y + ty;
i2d = i + NXPROB*j;
if(i2d < NXPROB*NYPROB)
temp[tx][ty] = d_X1[i2d];
__syncthreads();
if(i>NXPROB && i<NXPROB*NYPROB && j>NYPROB && j<NXPROB*NYPROB)
{
if((tx>0) && (tx < NTHREADS-1) && tx < NTHREADS-1){
part_x = (temp[tx+1][ty] - 2*temp[tx][ty] + temp[tx-1][ty]);
part_y = (temp[tx][ty+1] - 2*temp[tx][ty] + temp[tx][ty-1]);
}
d_X2[i2d] = d_X1[i2d] + CX*CY*(part_x + part_y);
}
}
}
/*
* Main
*/
int main(int argc, char* argv[]) {
long int s, it;
unsigned int flag, verbose;
unsigned int NX, NY, NXPROB, NYPROB;
double start, end;
int iz;
dim3 dimBlock, dimGrid;
bool shared_memory = false; //FALSE = no shared memory - TRUE = shared memory
// Defaut values
NX = 100;
NY = 100;
// create file and verbose flags
flag = 0;
verbose = 0;
// Parse command line options
int opt;
char *file = NULL;
while ((opt = getopt(argc, argv, "hvs:f:")) != -1) {
switch (opt) {
case 'v':
verbose = 1;
break;
case 's':
if( !(s=atoi(optarg)) ) {
fprintf(stderr, "Cannot parse %s value.\n", optarg);
exit(EXIT_FAILURE);
}
NX = NY = s;
break;
case 'f':
file = optarg;
flag = 1;
break;
case 'h':
default:
fprintf(stderr, "Usage: %s [-s SIZE] [-f output file]\n", argv[0]);
exit(EXIT_FAILURE);
}
}
// Set initial data values
NXPROB = NX - 1;
NYPROB = NY - 1;
if(verbose) {
fprintf(stdout, "[INFO] Setting map size to %d (%dx%d)\n", NX*NY, NX, NY);
fprintf(stdout, "[INFO] Max iter %d\n", MAXSTEP);
}
if(verbose && flag) {
fprintf(stdout, "[INFO] Using output file %s\n", file);
}
// Program starts here
start = gettime();
// CPU Memory allocation
double** X[2];
X[0] = make2DDoubleArray (NX, NY);
X[1] = make2DDoubleArray (NX, NY);
// Set initial and boundary conditions
initializeArray (X[0], NX, NY);
setupBoundaryConditions(X[0], NX, NY);
setupBoundaryConditions(X[1], NX, NY);
// GPU Memory allocation
double *d_X[2];
cudaMalloc((void **)&d_X[0], NX*NY*sizeof(double) );
cudaMalloc((void **)&d_X[1], NX*NY*sizeof(double) );
// Copy CPU --> GPU
cudaMemcpy(d_X[0], X[0], NX*NY*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_X[1], X[1], NX*NY*sizeof(double), cudaMemcpyHostToDevice);
dimBlock = dim3(NTHREADS,NTHREADS);
dimGrid = dim3(ceil(NX/dimBlock.x),ceil(NY/dimBlock.y));
// Main calculations
iz = 0;
for (it = 0; it < MAXSTEP; it++)
{
if(verbose && (it%(MAXSTEP/10) == 0)) {
fprintf(stdout, "[INFO] iteration %ld, time %.3f seconds\n", it, gettime()-start);
}
update<<<dimGrid, dimBlock>>>(shared_memory, NX, NXPROB, NYPROB, d_X[iz], d_X[1-iz]);
iz = 1 - iz;
}
cudaThreadSynchronize();
// Copy GPU --> CPU
//cudaMemcpy(X[iz], d_X[iz], NX*NY*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(X, d_X, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);
// Save output file
if(flag) save(X[iz], NX, NY, file);
// End time
end = gettime();
// Get information: wall clock time, problem size, ...
if(verbose)
{
fprintf(stdout, "[INFO] Convergence after %d steps\n", MAXSTEP);
fprintf(stdout, "[INFO] Problem size %d [%dx%d]\n", NY*NX, NX, NY);
fprintf(stdout, "[INFO] Wall clock time %lf seconds\n",(end-start));
if(flag) fprintf(stdout, "[INFO] Output file %s\n", file);
}
else
{
printf("Time %.3f seconds, Size %d [%dx%d]\n", end - start, NY*NX, NX, NY);
}
cudaFree(d_X);
exit(EXIT_SUCCESS);
}
void setupBoundaryConditions(double** X, unsigned long int x, unsigned long int y) {
/* set boundary conditions for ix, jy = 0 and ix, jy = n-1 */
unsigned long int i, j;
double leftBC = 0,
rightBC = 0,
topBC = 0,
bottomBC = 0;
/* setup the bottom and top BCs, jy = 0 and jy = n-1 or arraySizeY - 1 */
for (i = 0; i < x; i++)
{
X[i][0] = bottomBC; //bottom BC
X[i][y-1] = topBC; //top BC
}
/* setup the left and right BCs, ix = 0 and ix = arraySizeX - 1 */
for (j = 0; j < y; j++)
{
X[0][j] = leftBC; //left BC
X[x-1][j] = rightBC; //right BC
}
/* set the values at the corner nodes as averages of both sides*/
// bottom left
X[0][0] = 0.5 * (leftBC + bottomBC);
// top left
X[0][y-1] = 0.5 * (topBC + leftBC);
// top right
X[x-1][y-1] = 0.5 * (topBC + rightBC);
// bottom right
X[x-1][0] = 0.5 * (bottomBC + rightBC);
}
void initializeArray(double** X, unsigned long int x, unsigned long int y) {
unsigned long int i, j;
for (i = 1; i < x; i++)
{
for (j = 1; j < y; j++)
{
X[i][j] = (double)MAXSTEP+(i * (x - i - 1) * j * (y - j - 1));
}
}
for (i = 0; i < x; i++)
{
X[i][0] = 0; //bottom BC
X[i][y-1] = 0; //top BC
}
for (j = 1; j < y; j++)
{
X[0][j] = 0; //left BC
X[x-1][j] = 0; //right BC
}
}
double** make2DDoubleArray(unsigned long int x, unsigned long int y) {
unsigned long int ix;
double** X;
X = (double**) malloc(x*sizeof(double*));
for (ix = 0; ix < x; ix++) {
X[ix] =(double*) malloc(y*sizeof(double));
}
return X;
}
void save(double** X, unsigned long int x, unsigned long int y, char* filename) {
unsigned long int i, j;
FILE* file;
file = fopen(filename,"w");
for (i = 0; i < x; i++)
{
for (j = 0; j < y; j++)
{
fprintf(file,"%8.3f ", X[i][j]);
}
fprintf(file,"\n");
}
fclose(file);
}
/* Timing function */
double gettime(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return tv.tv_sec + 1e-6*tv.tv_usec;
}
虽然结果可能会有一些错误。