我有一个 CUDA 应用程序,其中需要把数据从主机内存复制到常量内存。复制过程没有报告任何错误,但常量内存中并没有得到应复制的值;调试程序时,我在常量内存中读到的始终是 0。我的代码如下:
以下变量定义在单独的 header.h 文件中:
#include <windows.h>
#include <dos.h>
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <curand.h>
#include <curand_kernel.h>
// Side length of the square environment: the host and device matrices are
// allocated as env_end x env_end elements.
#define env_end 48
// Heuristic tables in constant memory, two floats per environment row.
// main() fills them from the host with cudaMemcpyToSymbol before the kernel launch.
__constant__ float dev_h_top[2*env_end];
__constant__ float dev_h_bot[2*env_end];
// Row/column offset tables (8 entries each) in constant memory, also filled
// by main() with cudaMemcpyToSymbol.
__constant__ int dev_row_top[8];
__constant__ int dev_col_top[8];
__constant__ int dev_row_bot[8];
__constant__ int dev_col_bot[8];
// Host-side initialisation of the environment matrix, pheromone matrices and agent property tables.
void INIT_AG_PLACEMENT_FUNC(int *,int ,int ,int,double *,double *,int,int *,int *,int *, int *);
// Kernel that records, for every agent cell, its 8 surrounding cells and their weights.
__global__ void AGENT_POSITION_FUNC(int *,double *,double *,int *,int *,int *,int *,int *,int *,double *, double *);
// Aborts with a diagnostic when a CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA Error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Returns `bytes` of zero-initialised host memory, aborting if the allocation fails.
static void *hostAllocZeroed(size_t bytes)
{
    void *ptr = calloc(1, bytes);
    if (ptr == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

// Returns `bytes` of uninitialised device memory, aborting if the allocation fails.
static void *deviceAlloc(size_t bytes)
{
    void *ptr = NULL;
    checkCuda(cudaMalloc(&ptr, bytes), "cudaMalloc");
    return ptr;
}

// Allocates a device buffer and fills it with a copy of `bytes` bytes from `host`.
static void *deviceCopyOf(const void *host, size_t bytes)
{
    void *ptr = deviceAlloc(bytes);
    checkCuda(cudaMemcpy(ptr, host, bytes, cudaMemcpyHostToDevice), "cudaMemcpy host->device");
    return ptr;
}

// Allocates a device buffer and clears it to zero bytes.
static void *deviceZeroed(size_t bytes)
{
    void *ptr = deviceAlloc(bytes);
    checkCuda(cudaMemset(ptr, 0, bytes), "cudaMemset");
    return ptr;
}

/*
 * Program entry point.
 *
 * Builds the environment on the host (INIT_AG_PLACEMENT_FUNC), fills the
 * heuristic tables, mirrors all buffers on the device, copies the lookup
 * tables into constant memory and launches AGENT_POSITION_FUNC once over the
 * whole env_end x env_end grid.
 *
 * Every CUDA call and every host allocation is checked; the first failure
 * prints a diagnostic and terminates the process.
 *
 * Returns 0 on success.
 */
int main(int argc,char *argv[])
{
    int i;
    const int rows  = 16;    // number of rows initially occupied by each agent population
    const int del_t = 320;   // agents removed at random from the top population
    const int del_b = 320;   // agents removed at random from the bottom population
    const float init_p_val = 200.0;   // initial pheromone value (converted to int by the callee's signature)

    const size_t cells      = (size_t)env_end * env_end;
    const size_t top_agents = (size_t)((rows*env_end) - del_t);
    const size_t bot_agents = (size_t)((rows*env_end) - del_b);
    const size_t h_bytes    = 2*env_end*sizeof(float);

    int row_top[8] = {1,1,1,0,0,-1,-1,-1}, col_top[8] = {-1,0,1,-1,1,-1,0,1};
    int row_bot[8] = {-1,-1,-1,0,0,1,1,1}, col_bot[8] = {-1,0,1,-1,1,-1,0,1};

    // ---- Host buffers ----
    // Main matrix and the per-cell agent index matrices
    int *mat         = (int *)hostAllocZeroed(cells*sizeof(int));
    int *mat_ind_top = (int *)hostAllocZeroed(cells*sizeof(int));
    int *mat_ind_bot = (int *)hostAllocZeroed(cells*sizeof(int));
    // Top and bottom pheromone matrices (filled by INIT_AG_PLACEMENT_FUNC)
    double *p_top = (double *)hostAllocZeroed(cells*sizeof(double));
    double *p_bot = (double *)hostAllocZeroed(cells*sizeof(double));
    // Agent property tables (8 ints per agent) and tour lengths
    int    *top_ag_prop  = (int *)hostAllocZeroed(8*top_agents*sizeof(int));
    double *top_tour_len = (double *)hostAllocZeroed(top_agents*sizeof(double));
    int    *bot_ag_prop  = (int *)hostAllocZeroed(8*bot_agents*sizeof(int));
    double *bot_tour_len = (double *)hostAllocZeroed(bot_agents*sizeof(double));

    INIT_AG_PLACEMENT_FUNC(mat, rows, del_t, del_b, p_top, p_bot, init_p_val,
                           top_ag_prop, bot_ag_prop, mat_ind_top, mat_ind_bot);

    // Heuristics tables: two entries per row
    float *h_mat_top = (float *)hostAllocZeroed(h_bytes);
    float *h_mat_bot = (float *)hostAllocZeroed(h_bytes);
    for (i=0;i<env_end;i++)
    {   h_mat_top[i*2] = sqrt(double(((env_end-i)*(env_end-i)) +1)); // This stores the distance of the agents placed in the top
        h_mat_top[i*2+1] = (env_end-i);                              // to the end of the environment target
        printf("%f\t%f\n",h_mat_top[i*2] ,h_mat_top[i*2+1] );
        h_mat_bot[i*2] = sqrt(double((i*i)+1)) ;                     // This stores the distance of the agents placed in the bottom
        h_mat_bot[i*2+1] = double(i);                                // to the end of the environment target
    }

    // ---- Device buffers mirrored from the host ----
    int *dev_mat            = (int *)deviceCopyOf(mat, cells*sizeof(int));
    int *dev_top_ag_ind_mat = (int *)deviceCopyOf(mat_ind_top, cells*sizeof(int));
    int *dev_bot_ag_ind_mat = (int *)deviceCopyOf(mat_ind_bot, cells*sizeof(int));
    double *dev_p_top       = (double *)deviceCopyOf(p_top, cells*sizeof(double));
    double *dev_p_bot       = (double *)deviceCopyOf(p_bot, cells*sizeof(double));
    int *dev_top_ag_prop    = (int *)deviceCopyOf(top_ag_prop, 8*top_agents*sizeof(int));
    int *dev_bot_ag_prop    = (int *)deviceCopyOf(bot_ag_prop, 8*bot_agents*sizeof(int));

    // ---- Device-only buffers ----
    // Surrounding cells and move probabilities, 8 entries per agent
    int *dev_top_ag_srd      = (int *)deviceZeroed(8*top_agents*sizeof(int));
    int *dev_bot_ag_srd      = (int *)deviceZeroed(8*bot_agents*sizeof(int));
    double *dev_top_ag_prob  = (double *)deviceZeroed(8*top_agents*sizeof(double));
    double *dev_bot_ag_prob  = (double *)deviceZeroed(8*bot_agents*sizeof(double));
    // Random number generator states for top and bottom agents (not used by this launch)
    curandState *state_t = (curandState *)deviceAlloc(8*top_agents*sizeof(curandState));
    curandState *state_b = (curandState *)deviceAlloc(8*bot_agents*sizeof(curandState));

    // ---- Constant memory ----
    // NOTE: without separate compilation / device linking, a __constant__ symbol has
    // module scope only, so these symbols must be defined in the same .cu file that
    // performs the copy and launches the kernel reading them.
    checkCuda(cudaMemcpyToSymbol(dev_h_top,h_mat_top,h_bytes), "cudaMemcpyToSymbol dev_h_top");
    checkCuda(cudaMemcpyToSymbol(dev_h_bot,h_mat_bot,h_bytes), "cudaMemcpyToSymbol dev_h_bot");
    checkCuda(cudaMemcpyToSymbol(dev_row_top,row_top,8*sizeof(int)), "cudaMemcpyToSymbol dev_row_top");
    checkCuda(cudaMemcpyToSymbol(dev_col_top,col_top,8*sizeof(int)), "cudaMemcpyToSymbol dev_col_top");
    checkCuda(cudaMemcpyToSymbol(dev_row_bot,row_bot,8*sizeof(int)), "cudaMemcpyToSymbol dev_row_bot");
    checkCuda(cudaMemcpyToSymbol(dev_col_bot,col_bot,8*sizeof(int)), "cudaMemcpyToSymbol dev_col_bot");

    // ---- Kernel launch: 16x16 threads per block, grid covering the whole environment ----
    dim3 gridDim_1(env_end/16,env_end/16,1);
    dim3 blockDim_1(16,16,1);
    AGENT_POSITION_FUNC<<<gridDim_1,blockDim_1>>>(dev_mat, dev_p_top, dev_p_bot, dev_top_ag_prop,dev_bot_ag_prop, dev_top_ag_srd, dev_bot_ag_srd,
                                                  dev_top_ag_ind_mat, dev_bot_ag_ind_mat,dev_top_ag_prob, dev_bot_ag_prob);
    checkCuda(cudaGetLastError(), "AGENT_POSITION_FUNC launch");          // invalid launch configuration
    checkCuda(cudaDeviceSynchronize(), "AGENT_POSITION_FUNC execution");  // faults raised while the kernel ran

    // ---- Cleanup ----
    cudaFree(dev_mat);
    cudaFree(dev_top_ag_ind_mat);
    cudaFree(dev_bot_ag_ind_mat);
    cudaFree(dev_p_top);
    cudaFree(dev_p_bot);
    cudaFree(dev_top_ag_prop);
    cudaFree(dev_bot_ag_prop);
    cudaFree(dev_top_ag_srd);
    cudaFree(dev_bot_ag_srd);
    cudaFree(dev_top_ag_prob);
    cudaFree(dev_bot_ag_prob);
    cudaFree(state_t);
    cudaFree(state_b);
    free(top_ag_prop);
    free(bot_ag_prop);
    free(h_mat_top);
    free(h_mat_bot);
    free(top_tour_len);
    free(bot_tour_len);
    free(p_top);
    free(p_bot);
    free(mat_ind_top);
    free(mat_ind_bot);
    free(mat);
    return 0;
}
// Clears `count` distinct occupied (non-zero) cells picked at random from the
// `rows` rows starting at `first_row`.
// Precondition: the band holds at least `count` non-zero cells, otherwise the
// loop cannot terminate.
static void clear_random_cells(int *mat, int first_row, int rows, int count)
{
    int cleared = 0;
    if (rows <= 0)
    {   return;                       // empty band: nothing to remove (also avoids rand() % 0)
    }
    while (cleared < count)
    {   const int r = first_row + rand()%rows;
        const int c = rand()%env_end;
        if (mat[r*env_end+c] != 0)    // already-empty cells are simply re-drawn
        {   mat[r*env_end+c] = 0;
            cleared += 1;
        }
    }
}

/*
 * Builds the initial environment on the host.
 *
 *   mat          env_end x env_end cell matrix; the first `rows` rows are set to 1
 *                (top agents), the last `rows` rows to 2 (bottom agents). Other
 *                cells are left as the caller initialised them.
 *   rows         height of each agent band; expected 0 <= 2*rows <= env_end so the
 *                two bands do not overlap.
 *   del_t/del_b  number of top / bottom agents removed at random (cell reset to 0).
 *   p_top/p_bot  env_end x env_end pheromone matrices, every entry set to init_p_val.
 *   top_ag_prop / bot_ag_prop
 *                8 ints per surviving agent, in row-major scan order:
 *                id | index number | row | col | target col | future_row | future_col | empty cell
 *   mat_ind_top / mat_ind_bot
 *                for each cell holding an agent, the agent's index in its property
 *                table; other cells are not written.
 *
 * The random generator is seeded once from the wall clock, so the bottom
 * removals continue the sequence instead of replaying the top removals.
 */
void INIT_AG_PLACEMENT_FUNC(int *mat,int rows,int del_t,int del_b,double *p_top,double *p_bot,int init_p_val,
                            int *top_ag_prop,int *bot_ag_prop, int *mat_ind_top, int *mat_ind_bot)
{
    int i,j,t,k;

    // Top band -> 1, bottom band -> 2
    for (i=0;i<rows;i++)
    {   for (j=0;j<env_end;j++)
        {   mat[i*env_end+j]=1;
        }
    }
    for (i=env_end-rows;i<env_end;i++)
    {   for (j=0;j<env_end;j++)
        {   mat[i*env_end+j]=2;
        }
    }

    srand(time(NULL));
    // Rows are drawn from [0, rows) and [env_end-rows, env_end) respectively, so a
    // removal can never land outside the band it is meant to thin out.
    clear_random_cells(mat, 0, rows, del_t);
    clear_random_cells(mat, env_end - rows, rows, del_b);

    t=0,k=0;
    for (i=0;i<env_end;i++)
    {   for (j=0;j<env_end;j++)
        {   //id | index number | row | col | target col | future_row | future_col | empty cell
            if (mat[i*env_end+j] == 1)
            {   top_ag_prop[t*8] = 1;   top_ag_prop[t*8+1] = t;  top_ag_prop[t*8+2] = i;  top_ag_prop[t*8+3] = j;
                top_ag_prop[t*8+4] = j; top_ag_prop[t*8+5] = -1; top_ag_prop[t*8+6] = -1; top_ag_prop[t*8+7] = -1;
                mat_ind_top[i*env_end+j] = t;
                t+=1;
            }
            else if (mat[i*env_end+j] == 2)
            {   bot_ag_prop[k*8] = 2;   bot_ag_prop[k*8+1] = k;  bot_ag_prop[k*8+2] = i;  bot_ag_prop[k*8+3] = j;
                bot_ag_prop[k*8+4] = j; bot_ag_prop[k*8+5] = -1; bot_ag_prop[k*8+6] = -1; bot_ag_prop[k*8+7] = -1;
                mat_ind_bot[i*env_end+j] = k;
                k+=1;
            }
            p_top[i*env_end+j] = init_p_val;
            p_bot[i*env_end+j] = init_p_val;
        }
    }
}
/*
 * AGENT_POSITION_FUNC
 *
 * One thread per environment cell. Each block stages its 16x16 tile of the cell
 * matrix and of both pheromone matrices in shared memory together with a
 * one-cell halo (18x18), then every thread whose cell holds an agent (1 = top,
 * 2 = bottom) writes, for its 8 neighbours:
 *   dev_*_ag_srd [agent*8 + n]  the neighbour's cell value (-1 outside the environment)
 *   dev_*_ag_prob[agent*8 + n]  neighbour pheromone * 1/heuristic of the neighbour's row
 *
 * Neighbour order n = 0..7: the three cells of the "forward" row (left, middle,
 * right), the two side cells (left, right), then the three cells of the
 * "backward" row. Forward is row+1 for top agents and row-1 for bottom agents.
 * The heuristic entry used is [2*r+1] for the cell straight ahead/behind and
 * [2*r] for every other neighbour, r being the neighbour's row.
 *
 * Launch requirements: blockDim = (16,16,1) (the shared tiles are hard-coded and
 * the halo load uses threadIdx.x as a row offset) and a grid that covers the
 * env_end x env_end environment exactly; there is no tail guard.
 * Uses 8528 bytes of static shared memory per block.
 *
 * Neighbours whose row lies outside the environment get a weight of 0 instead
 * of reading dev_h_top / dev_h_bot out of bounds.
 *
 * dev_top_ag_prop and dev_bot_ag_prop are accepted but not used here.
 */
__global__ void AGENT_POSITION_FUNC(int *dev_mat,double *dev_p_top,double *dev_p_bot,int *dev_top_ag_prop, int *dev_bot_ag_prop,
                                    int *dev_top_ag_srd, int *dev_bot_ag_srd,int *dev_top_ag_ind_mat, int *dev_bot_ag_ind_mat,
                                    double *dev_top_ag_prob, double *dev_bot_ag_prob)
{
    const int tx = threadIdx.x, ty = threadIdx.y;
    const int bx = blockIdx.x,  by = blockIdx.y;
    const int bdx = blockDim.x, bdy = blockDim.y;
    const int row = by*bdy + ty;
    const int col = bx*bdx + tx;

    // Debug output: one line per row showing what the kernel sees in constant memory.
    if (col==0)
    {   printf("%f\t%f\n",dev_h_top[row*2],dev_h_top[row*2+1]);
    }

    __shared__ int mat_block_local[18][18];
    __shared__ int mat_ind_local_top[16][16];
    __shared__ int mat_ind_local_bot[16][16];
    __shared__ double p_mat_local_top[18][18];
    __shared__ double p_mat_local_bot[18][18];

    // Inner elements of the cell matrix, the agent index matrices and the pheromone matrices
    mat_block_local[ty+1][tx+1] = dev_mat[row*env_end+col];
    mat_ind_local_top[ty][tx]   = dev_top_ag_ind_mat[row*env_end+col];
    mat_ind_local_bot[ty][tx]   = dev_bot_ag_ind_mat[row*env_end+col];
    p_mat_local_top[ty+1][tx+1] = dev_p_top[row*env_end+col];
    p_mat_local_bot[ty+1][tx+1] = dev_p_bot[row*env_end+col];

    if (ty<=1)
    {
        // Thread row 0 loads the left/top halo, thread row 1 the right/bottom halo.
        const int side     = ty*(bdx+1);               // shared column 0 or bdx+1
        const int edge     = ty*(bdy+1);               // shared row 0 or bdy+1
        const int halo_col = (bx+ty)*bdx - (!ty);      // global column just left / right of the tile
        const int halo_row = (by+ty)*bdy - (!ty);      // global row just above / below the tile
        const bool col_ok  = (halo_col >= 0) && (halo_col < env_end);
        const bool row_ok  = (halo_row >= 0) && (halo_row < env_end);

        // Left and right vertical halo (without corners); tx walks down the tile rows
        const int v_idx = (by*bdy+tx)*env_end + halo_col;
        mat_block_local[tx+1][side] = col_ok ? dev_mat[v_idx]   : -1;
        p_mat_local_top[tx+1][side] = col_ok ? dev_p_top[v_idx] : -1;
        p_mat_local_bot[tx+1][side] = col_ok ? dev_p_bot[v_idx] : -1;

        // Top and bottom horizontal halo (without corners)
        const int h_idx = halo_row*env_end + (bx*bdx+tx);
        mat_block_local[edge][tx+1] = row_ok ? dev_mat[h_idx]   : -1;
        p_mat_local_top[edge][tx+1] = row_ok ? dev_p_top[h_idx] : -1;
        p_mat_local_bot[edge][tx+1] = row_ok ? dev_p_bot[h_idx] : -1;

        // Corner halo elements; every thread of the two loading rows stores the same value
        const bool left_ok  = row_ok && (bx != 0);
        const bool right_ok = row_ok && ((bx+1)*bdx < env_end);
        const int  l_idx    = halo_row*env_end + (bx*bdx-1);
        const int  r_idx    = halo_row*env_end + (bx+1)*bdx;
        mat_block_local[edge][0]     = left_ok  ? dev_mat[l_idx]   : -1;
        p_mat_local_top[edge][0]     = left_ok  ? dev_p_top[l_idx] : -1;
        p_mat_local_bot[edge][0]     = left_ok  ? dev_p_bot[l_idx] : -1;
        mat_block_local[edge][bdx+1] = right_ok ? dev_mat[r_idx]   : -1;
        p_mat_local_top[edge][bdx+1] = right_ok ? dev_p_top[r_idx] : -1;
        p_mat_local_bot[edge][bdx+1] = right_ok ? dev_p_bot[r_idx] : -1;
    }
    // All tiles must be complete before any thread reads its neighbours.
    __syncthreads();

    const int kind = mat_block_local[ty+1][tx+1];
    if (kind == 1 || kind == 2)
    {
        const bool is_top  = (kind == 1);
        const int  forward = is_top ? 1 : -1;
        const int  agent   = is_top ? mat_ind_local_top[ty][tx] : mat_ind_local_bot[ty][tx];
        int    *srd  = (is_top ? dev_top_ag_srd  : dev_bot_ag_srd)  + agent*8;
        double *prob = (is_top ? dev_top_ag_prob : dev_bot_ag_prob) + agent*8;

        for (int n = 0; n < 8; ++n)
        {
            const int k  = (n < 4) ? n : n+1;          // slot in the 3x3 window, centre skipped
            const int dy = forward*(1 - k/3);          // forward row first, backward row last
            const int dx = k%3 - 1;                    // left, middle, right
            const int sr = ty+1+dy, sc = tx+1+dx;      // neighbour position inside the shared tile
            const int nrow = row + dy;                 // neighbour's global row

            srd[n] = mat_block_local[sr][sc];

            float inv_h = 0.0f;                        // off-grid rows carry no weight
            if (nrow >= 0 && nrow < env_end)
            {   const int h = nrow*2 + (dx == 0 ? 1 : 0);
                inv_h = 1/(is_top ? dev_h_top[h] : dev_h_bot[h]);
            }
            const double pher = is_top ? p_mat_local_top[sr][sc] : p_mat_local_bot[sr][sc];
            prob[n] = pher*inv_h;
        }
    }
}
现在,当我调试时,dev_h_top、dev_h_bot 以及其他常量变量中读到的始终是 0,我不知道哪里出了错。我确认主机端变量中存储的值是正确的,复制函数也没有返回任何错误,但不知道为什么所需的值没有被复制过去。在调试内核时,由于 dev_h_top 和 dev_h_bot 中的值为 0,dev_top_ag_prob 和 dev_bot_ag_prob 中得到的总是无穷大。我还写了一个可以正常运行的测试程序;既然它能正常工作,我原来的程序按理也应该可以,但不知为何却不行,我也不清楚自己在原程序中做错了什么。测试程序如下:
/*
 * Minimal reproduction: copies an 8-entry table into the constant symbol `test`,
 * runs test_kernel on `num` random inputs with one block of `num` threads and
 * prints the 8 products computed for each input.
 *
 * NOTE(review): the declarations of `test` and `test_kernel` are not part of
 * this snippet; `test` is presumably `__constant__ int test[8];` in the same
 * .cu file - confirm.
 *
 * Returns 0.
 */
int main()
{
    int num,*test_var,test_cons_var[8]={1,1,1,0,0,-1,-1,-1}, *test_res;
    int *dev_test,*dev_res_var;
    int i,j;
    cudaError_t status;
    num = 32;
    test_var = (int *)malloc(num*sizeof(int));
    test_res = (int *)malloc(8*num*sizeof(int));
    for (i=0;i<num;i++)
    {   test_var[i] = rand()%(10);
        printf("%d\n",test_var[i]);
    }
    cudaMalloc((void **)&dev_test,num*sizeof(int));
    cudaMemcpy(dev_test,test_var,num*sizeof(int),cudaMemcpyHostToDevice);
    cudaMalloc((void **)&dev_res_var,8*num*sizeof(int));
    status = cudaMemcpyToSymbol(test,test_cons_var,8*sizeof(int));
    if (status != cudaSuccess)
    {   printf("CUDA Error: %s\n", cudaGetErrorString(status));
    }
    test_kernel<<<1,num>>>(dev_test,dev_res_var);
    // This blocking copy waits for the kernel, so its status also reports kernel failures.
    status = cudaMemcpy(test_res,dev_res_var,8*num*sizeof(int),cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {   printf("CUDA Error: %s\n", cudaGetErrorString(status));
    }
    printf("\n\n");
    for (i=0;i<num;i++)
    {   for (j=0;j<8;j++)
        {   printf("%d\t",test_res[i*8+j]);
        }
        printf("\n");
    }
    cudaFree(dev_test);
    cudaFree(dev_res_var);
    free(test_var);
    free(test_res);
    return 0;
}
// Each thread multiplies its own input element dev_test[threadIdx.x] by the 8
// entries of the constant table `test` and stores the products in its 8-int
// slice of dev_res_var. Indexing uses threadIdx.x only.
__global__ void test_kernel(int *dev_test,int *dev_res_var)
{
    const int lane = threadIdx.x;
    int *slot = dev_res_var + lane*8;
    int k = 0;
    while (k != 8)
    {
        slot[k] = dev_test[lane]*test[k];
        ++k;
    }
}
如能提供任何帮助,我将不胜感激。谢谢!
答案 0 :(得分:1)
正如我之前所说,而你似乎也证实了这一点:你在这里发布的代码并不是你实际运行的代码;当你运行这里发布的代码时,得到的结果是正确的。
如果你确实在其他模块中使用了 `__constant__` 声明,就会出现问题。按照你在问题中发布的方式,它们处于同一个模块中,因为该头文件只被问题中唯一的模块(.cu 文件)包含。但是,如果你的实际项目包含多个 .cu 文件,并且你把 `__constant__` 声明放在一个文件中、却试图在另一个文件中使用它们,这就会出问题。除非你使用单独编译(separate compilation)并配合设备链接器,否则 `__constant__` 声明仅具有模块作用域(module scope)。由于你的常量声明位于头文件中,你很可能因此遇到麻烦。
假设你没有使用单独编译,请确保 `__constant__` 声明只在单个模块(.cu 文件)中定义(即只被该文件包含),并确保对它们的所有修改、更新和使用都只发生在该文件内。