Question

非常简单的计算内核：tmp = X * y; tmp = sigmoid（temp）-L; Y =转置（X）* TMP;但是，有时它会返回正确的结果，有时会返回错误的结果，有时会出现1000 * 1000大小的问题，它会返回正确的结果，但是当我增加问题大小时，它会返回错误的结果。它似乎有一些竞争条件。但所有数据都受到tid的限制。你能帮我找出什么是虫子吗？

谢谢！

＆＃13;

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BLOCK_ROWS 512 

__global__ void MVM(int trows, int tcols, float *d_x, float *d_y, float *d_l, float *d_out)
{
        int tid = blockIdx.x*blockDim.x+threadIdx.x;
        if(tid < trows) {
                d_out[tid]=0;
                for(int i=0; i<tcols; i++)
                {
                        d_out[tid] = d_out[tid] + d_x[i*trows+tid]*d_y[i];
                }
                d_out[tid] = 1.0/(exp(-d_out[tid])+1.0)-d_l[tid];
         
        }

        __syncthreads;

        if(tid < tcols) {
                d_y[tid] =0; 
                for(int i=0; i<trows; i++)
                {
                        d_y[tid]  = d_y[tid] + d_x[tid*trows+i]*d_out[i];
                }
        }
}
int main(void)
{
  int trows = 100; int tcols = 100;
  float *x, *y, *out, *l, *d_x, *d_y, *d_out, *d_l, *check, *check1;
  x = (float*)malloc(trows*tcols*sizeof(float));
  y = (float*)malloc(tcols*sizeof(float));
  l = (float*)malloc(trows*sizeof(float));
  out = (float*)malloc(tcols*sizeof(float));
  check = (float*)malloc(trows*sizeof(float));
  check1 = (float*)malloc(tcols*sizeof(float));

  int result=0;
  result = cudaMalloc(&d_x, trows*tcols*sizeof(float));
  if(result!=cudaSuccess) printf("GPU allocation fail\n");
  result = cudaMalloc(&d_y, tcols*sizeof(float));
  if(result!=cudaSuccess) printf("GPU allocation fail\n");
  result = cudaMalloc(&d_out, trows*sizeof(float));
  if(result!=cudaSuccess) printf("GPU allocation fail\n");
  result = cudaMalloc(&d_l, trows*sizeof(float));
  if(result!=cudaSuccess) printf("GPU allocation fail\n");

  for(int j = 0; j <  tcols; j++) {
        for (int i = 0; i < trows; i++)
                x[j*trows+i] = (float)(i%10);
  }

  for(int i=0; i<tcols; i++) y[i] = (float)(i%10);

  for(int i=0; i<trows; i++) l[i] = (float)((trows-i)%10);

  result = cudaMemcpy(d_x, x, trows*tcols*sizeof(float), cudaMemcpyHostToDevice);
  if(result!=cudaSuccess) printf("cpying to GPU fail\n");
  result = cudaMemcpy(d_y, y, tcols*sizeof(float), cudaMemcpyHostToDevice);
  if(result!=cudaSuccess) printf("cpying to GPU fail\n");
  result = cudaMemcpy(d_l, l, trows*sizeof(float), cudaMemcpyHostToDevice);
  if(result!=cudaSuccess) printf("cpying to GPU fail\n");

  int grid=0;
  if(trows>tcols) grid = (trows-1)/BLOCK_ROWS+1; else grid = (tcols-1)/BLOCK_ROWS+1;
  dim3 dimGrid(grid,1,1);
  dim3 dimBlock(BLOCK_ROWS,1,1);

  clock_t t;
  t = clock();
  MVM<<<dimGrid, dimBlock>>>(trows, tcols, d_x, d_y, d_l, d_out);
  t = clock()-t;
  double time = ((double)t)/CLOCKS_PER_SEC;
  printf("time: %f\n", time);
  
    for(int i=0; i<trows; i++) {
        float tmp = 0;
        for(int j=0; j<tcols; j++)
                tmp += x[j*trows+i]*y[j];
        tmp = 1.0/(exp(-tmp)+1.0) - l[i];
        check[i] = tmp;
  }
  for(int i=0; i<tcols; i++) {
        float tmp = 0;
        for(int j=0; j<trows; j++)
                tmp = tmp+ x[i*trows+j]*check[j];
        check1[i] = tmp;
  }

  result = cudaMemcpy(out, d_y, tcols*sizeof(float), cudaMemcpyDeviceToHost);
  if(result!=cudaSuccess) printf("cpying to CPU fail, error=%d\n",result);

  float error=0;
  for(int i=0; i< tcols;i++) {
        error += abs(check1[i]-out[i])/(abs(check1[i])+1e-6);
  }
  printf("error = %f\n", error);

  cudaFree(d_x);
  cudaFree(d_y);
  cudaFree(d_out);
  cudaFree(d_l);
  free(x);
  free(y);
  free(l);
  free(out);
  free(check);
  free(check1);
}

＆＃13;

Answer 1

尝试为'dim3 dimBlock'更改'BLOCK_ROWS'的值，就像上面的'trows'一样。

dim3 dimBlock(trows,1,1); // instead of dim3 dimBlock(BLOCK_ROWS,1,1);

如果每个列中的元素一起处理，设置blockDimX等于行数通常可以避免由'__syncthreads（）;'

引起的问题

当我改变矩阵大小时，Cuda内核结果出错了

1 个答案: