Question

我现在只需要展示矩阵乘法的中间进展。

// Host-side reference multiply that prints a running completion percentage.
// Relies on h_A, h_B, h_Rs, mtxSize and actMtxSize from the surrounding code;
// actMtxSize is presumably the total number of output elements
// (mtxSize*mtxSize) -- TODO confirm against the caller.
// Naming note (kept from the original): `col` selects a row of h_A / h_Rs,
// and `row` selects a column of h_B.
for (unsigned int col = 0; col < mtxSize; col++) {
    for (unsigned int row = 0; row < mtxSize; row++) {
        unsigned tmp = 0;
        for (unsigned int idx = 0; idx < mtxSize; idx++) {
            tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
        }
        h_Rs[col*mtxSize+row] = tmp;

        // Number of output elements finished so far. Counted in 64 bits so the
        // product cannot wrap for large matrices.
        unsigned long long done = (unsigned long long)col * mtxSize + (row + 1);
        // Print a real decimal percentage. The previous "%d.%d" form printed
        // the division remainder as if it were the fractional digits.
        fprintf(stdout, "Progress : %.2f %%\r",
                100.0 * (double)done / (double)actMtxSize);
        fflush(stdout);
    }
}

在主机代码（使用CPU）的情况下，它非常容易，因为它按顺序处理，所以我们可以很容易地检查。

但是在GPU并行处理的情况下，我该怎么办？

内核运行后，在完成内核执行之前不会返回。

所以我无法在内核执行期间检查中间数据。

我想我需要使用异步内核调用，但我不太清楚。

即使使用异步内核调用，为了汇总分布在多个处理器上的各个线程块的进度数据，我是否必须使用 atomicAdd() 函数（也就是访问全局内存）？这会带来一些开销。

给我一些建议或提示。

我想知道CUDA的情况。

Answer 1

下面的代码演示了如何检查矩阵乘法内核的进度：

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX

// Fetches (and thereby clears) the CUDA runtime's last recorded error via
// cudaGetLastError(). If it is anything other than cudaSuccess, prints `msg`,
// the CUDA error string and the call site to stderr, then exits with status 1.
// Place it directly after the API call or kernel launch being checked.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t lastErr_ = cudaGetLastError(); \
        if (lastErr_ == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
            msg, cudaGetErrorString(lastErr_), \
            __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)

// Demo kernel: increments the counter at `data` INCS times, busy-waiting about
// TIME_INC clock ticks after each increment so a polling host can watch the
// value change. Each executing thread performs the full sequence; the caller
// in this file launches it as <<<1,1>>>.
//
// data: counter updated with atomicAdd; __threadfence_system() follows each
//       update so the write is ordered system-wide before the delay starts.
__global__ void mykernel(volatile int *data){

  for (int i = 0; i < INCS; i++){
    atomicAdd((int *)data,1);
    __threadfence_system();
    // clock64() returns a 64-bit `long long`. Keep the timestamp in that type:
    // `unsigned long` is only 32 bits on LLP64 platforms and would truncate it.
    const long long start = clock64();
    while((clock64() - start)<TIME_INC) {};
    }
  printf("progress check finished\n");
}

// Naive matrix multiply, one output element per thread: c = a * b.
//
// Indexing used by the code:
//   a is read as    a[r*colA + k]   (r < rowA, k < colA)
//   b is read as    b[k*colB + cc]  (cc < colB)
//   c is written as c[r*colB + cc]
// r comes from the x dimension of the launch, cc from the y dimension.
// Threads that fall outside rowA x colB do nothing.
//
// progress: when USE_PROGRESS is non-zero, thread (0,0) of each block adds 1
// to *progress after storing its own element, then issues
// __threadfence_system(). Other threads of that block may still be working,
// so the counter is an approximate "blocks done" indicator.
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
  const unsigned int r  = blockIdx.x*blockDim.x + threadIdx.x;
  const unsigned int cc = blockIdx.y*blockDim.y + threadIdx.y;

  // Guard: the grid may overhang the matrix edges.
  if ((r >= rowA) || (cc >= colB)) return;

  float acc = 0.0f;
  unsigned int k = 0;
  while (k < colA){
    acc += a[(r*colA)+k] * b[(k*colB) + cc];
    k++;
  }
  c[(r*colB)+cc] = acc;

#if USE_PROGRESS
  // Only the first thread of the block reports.
  if ((threadIdx.x == 0) && (threadIdx.y == 0)){
    atomicAdd((int *)progress, 1);
    __threadfence_system();
  }
#endif
}

// Interprets the result of cudaStreamQuery()/cudaEventQuery().
// Returns true only while the queried work is still in flight; returns false
// once it has completed OR failed (the caller's next synchronize plus
// cudaCheckErrors reports a failure).
static bool workStillRunning(cudaError_t queryStatus){
  if (queryStatus != cudaErrorNotReady) return false;
  // cudaErrorNotReady is a status, not a failure, but it may be left behind as
  // the runtime's "last error". Clear it so a later cudaCheckErrors() does not
  // abort on it.
  cudaGetLastError();
  return true;
}

// Demo driver:
//   1. launches mykernel and polls a mapped host counter while it runs;
//   2. runs matmult on MAT_DIMX x MAT_DIMY random data, timing it with events
//      and (when USE_PROGRESS is non-zero) printing progress in >10% steps.
// Returns 0 on success and 1 if the host allocation fails; CUDA failures
// terminate the process through cudaCheckErrors.
int main(){
// simple test to demonstrate reading progress data from kernel
  volatile int *d_data, *h_data;
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaCheckErrors("cudaSetDeviceFlags error");
  cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc error");
  cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
  cudaCheckErrors("cudaHostGetDevicePointer error");
  *h_data = 0;
  printf("kernel starting\n");
  mykernel<<<1,1>>>(d_data);
  cudaCheckErrors("kernel fail");
  int value = 0;
  bool running = true;
  do{
    // Query before reading the counter: this lets the loop end if the kernel
    // finished or faulted instead of spinning forever on a stale value.
    running = workStillRunning(cudaStreamQuery(0));
    int value1 = *h_data;
    if (value1 > value){
       printf("h_data = %d\n", value1);
       value = value1;}}
    while (running && (value < (INCS-1)));
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail 2");

// now try matrix multiply with progress

  float *h_c, *d_a, *d_b, *d_c;
  h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
  if (h_c == NULL) {printf("malloc fail\n"); return 1;}
  cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
  cudaCheckErrors("cudaMalloc a fail");
  cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
  cudaCheckErrors("cudaMalloc b fail");
  cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
  cudaCheckErrors("cudaMalloc c fail");

  // The same random host buffer is uploaded as both input matrices.
  for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
  cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy a fail");
  cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy b fail");

  cudaEvent_t start, stop;
  cudaEventCreate(&start); cudaEventCreate(&stop);
  cudaCheckErrors("event create fail");
  *h_data=0;
  // NOTE: the sizes below assume MAT_DIMY == MAT_DIMX (as #defined in this
  // file): grid.x is sized from MAT_DIMX while the kernel bounds its
  // x-derived row index by rowA = MAT_DIMY.
  dim3 block(16,16);
  dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
  printf("matrix multiply kernel starting\n");
  cudaEventRecord(start);
  matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
  cudaEventRecord(stop);
  // Checked after the stop record so the check does not sit between the two
  // timing events; it still catches a bad launch configuration.
  cudaCheckErrors("mat mult kernel launch fail");
#if USE_PROGRESS
  unsigned int num_blocks = grid.x*grid.y;
  float my_progress = 0.0f;
  printf("Progress:\n");
  do{
    // cudaEventQuery(stop) tells us whether the kernel is done (or failed),
    // so the loop cannot hang if the counter never reaches 90%.
    running = workStillRunning(cudaEventQuery(stop));
    int value1 = *h_data;
    float kern_progress = (float)value1/(float)num_blocks;
    if ((kern_progress - my_progress)> 0.1f) {
      printf("percent complete = %2.1f\n", (kern_progress*100));
      my_progress = kern_progress;}}
    while (running && (my_progress < 0.9f));
  printf("\n");
#endif
  cudaEventSynchronize(stop);
  cudaCheckErrors("event sync fail");
  float et;
  cudaEventElapsedTime(&et, start, stop);
  cudaCheckErrors("event elapsed time fail");
  cudaDeviceSynchronize();
  cudaCheckErrors("mat mult kernel fail");
  printf("matrix multiply finished.  elapsed time = %f milliseconds\n", et);

  // Release everything acquired above.
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  cudaFreeHost((void *)h_data);
  cudaCheckErrors("cleanup fail");
  free(h_c);

  return 0;
}

与第一个内核调用相关的代码只是为了演示让内核报告它的进度的基本思路。

代码的第二部分展示了一个在GPU上运行的朴素矩阵乘法示例，并由GPU报告其进度。我加入了通过预处理器宏移除进度检查代码的选项，以及对矩阵乘法内核计时的功能。在我的测试中，无论是否启用进度代码，耗时都没有明显差别。因此，虽然进度报告代码可能会增加一些开销，但相对于一个规模合理的矩阵乘法内核的运行时间而言，我没有观察到它带来明显的额外耗时。

如何检查矩阵乘法的进度？

1 个答案: