非常简单的计算内核:tmp = X * y; tmp = sigmoid(temp)-L; Y =转置(X)* TMP;但是,有时它会返回正确的结果,有时会返回错误的结果,有时会出现1000 * 1000大小的问题,它会返回正确的结果,但是当我增加问题大小时,它会返回错误的结果。它似乎有一些竞争条件。但所有数据都受到tid的限制。你能帮我找出什么是虫子吗?
谢谢!
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define BLOCK_ROWS 512
__global__ void MVM(int trows, int tcols, float *d_x, float *d_y, float *d_l, float *d_out)
{
int tid = blockIdx.x*blockDim.x+threadIdx.x;
if(tid < trows) {
d_out[tid]=0;
for(int i=0; i<tcols; i++)
{
d_out[tid] = d_out[tid] + d_x[i*trows+tid]*d_y[i];
}
d_out[tid] = 1.0/(exp(-d_out[tid])+1.0)-d_l[tid];
}
__syncthreads;
if(tid < tcols) {
d_y[tid] =0;
for(int i=0; i<trows; i++)
{
d_y[tid] = d_y[tid] + d_x[tid*trows+i]*d_out[i];
}
}
}
int main(void)
{
int trows = 100; int tcols = 100;
float *x, *y, *out, *l, *d_x, *d_y, *d_out, *d_l, *check, *check1;
x = (float*)malloc(trows*tcols*sizeof(float));
y = (float*)malloc(tcols*sizeof(float));
l = (float*)malloc(trows*sizeof(float));
out = (float*)malloc(tcols*sizeof(float));
check = (float*)malloc(trows*sizeof(float));
check1 = (float*)malloc(tcols*sizeof(float));
int result=0;
result = cudaMalloc(&d_x, trows*tcols*sizeof(float));
if(result!=cudaSuccess) printf("GPU allocation fail\n");
result = cudaMalloc(&d_y, tcols*sizeof(float));
if(result!=cudaSuccess) printf("GPU allocation fail\n");
result = cudaMalloc(&d_out, trows*sizeof(float));
if(result!=cudaSuccess) printf("GPU allocation fail\n");
result = cudaMalloc(&d_l, trows*sizeof(float));
if(result!=cudaSuccess) printf("GPU allocation fail\n");
for(int j = 0; j < tcols; j++) {
for (int i = 0; i < trows; i++)
x[j*trows+i] = (float)(i%10);
}
for(int i=0; i<tcols; i++) y[i] = (float)(i%10);
for(int i=0; i<trows; i++) l[i] = (float)((trows-i)%10);
result = cudaMemcpy(d_x, x, trows*tcols*sizeof(float), cudaMemcpyHostToDevice);
if(result!=cudaSuccess) printf("cpying to GPU fail\n");
result = cudaMemcpy(d_y, y, tcols*sizeof(float), cudaMemcpyHostToDevice);
if(result!=cudaSuccess) printf("cpying to GPU fail\n");
result = cudaMemcpy(d_l, l, trows*sizeof(float), cudaMemcpyHostToDevice);
if(result!=cudaSuccess) printf("cpying to GPU fail\n");
int grid=0;
if(trows>tcols) grid = (trows-1)/BLOCK_ROWS+1; else grid = (tcols-1)/BLOCK_ROWS+1;
dim3 dimGrid(grid,1,1);
dim3 dimBlock(BLOCK_ROWS,1,1);
clock_t t;
t = clock();
MVM<<<dimGrid, dimBlock>>>(trows, tcols, d_x, d_y, d_l, d_out);
t = clock()-t;
double time = ((double)t)/CLOCKS_PER_SEC;
printf("time: %f\n", time);
for(int i=0; i<trows; i++) {
float tmp = 0;
for(int j=0; j<tcols; j++)
tmp += x[j*trows+i]*y[j];
tmp = 1.0/(exp(-tmp)+1.0) - l[i];
check[i] = tmp;
}
for(int i=0; i<tcols; i++) {
float tmp = 0;
for(int j=0; j<trows; j++)
tmp = tmp+ x[i*trows+j]*check[j];
check1[i] = tmp;
}
result = cudaMemcpy(out, d_y, tcols*sizeof(float), cudaMemcpyDeviceToHost);
if(result!=cudaSuccess) printf("cpying to CPU fail, error=%d\n",result);
float error=0;
for(int i=0; i< tcols;i++) {
error += abs(check1[i]-out[i])/(abs(check1[i])+1e-6);
}
printf("error = %f\n", error);
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_out);
cudaFree(d_l);
free(x);
free(y);
free(l);
free(out);
free(check);
free(check1);
}
&#13;
答案 0 :(得分:-1)
尝试为'dim3 dimBlock'更改'BLOCK_ROWS'的值,就像上面的'trows'一样。
dim3 dimBlock(trows,1,1); // instead of dim3 dimBlock(BLOCK_ROWS,1,1);
如果每个列中的元素一起处理,设置blockDimX等于行数通常可以避免由'__syncthreads();'
引起的问题