How to keep track of executed CUDA blocks?

Time: 2017-04-18 18:20:38

Tags: cuda

To test my understanding of things, I decided to modify the vector addition from the CUDA samples so that the kernel quits after a given amount of time and is then relaunched to complete. The way I implement the "timeout" is with a pinned, host-mapped variable that the host sets to 1 after some time. Within the kernel, this variable is checked to decide whether execution should continue; a thread that does continue is marked as complete. To verify that each thread executes only once, I modified the addition to C[i] = C[i] + B[i]. This all works as expected; the device code looks like this:

/* Function 
 *  Internal device function used for getting the current thread's global ID
 *  regardless of the block/grid configuration. It assumes that the 
 *  grid and block are 3 dimensional.
 *
 *  @return: The thread's global ID
 */
static __device__ int get_global_idx()
{
  int blockId = blockIdx.x 
    + blockIdx.y * gridDim.x 
    + gridDim.x * gridDim.y * blockIdx.z; 
  int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
    + (threadIdx.z * (blockDim.x * blockDim.y))
    + (threadIdx.y * blockDim.x)
    + threadIdx.x;
  return threadId;
}
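
As a quick sanity check of the linearization above, here is a worked example with hypothetical launch dimensions (these values are illustrative and not part of the original sample):

/* Worked example (hypothetical launch): grid = (4, 2, 1), block = (8, 4, 2).
 * For blockIdx = (1, 1, 0) and threadIdx = (3, 2, 1):
 *   blockId  = 1 + 1*4 + 4*2*0                 = 5
 *   threadId = 5*(8*4*2) + 1*(8*4) + 2*8 + 3   = 320 + 32 + 16 + 3 = 371
 * i.e. global ID 371 of the 8 blocks * 64 threads = 512 threads in the grid.
 */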

/* Function
 *  Device function that determines if the current thread should continue execution.
 *  A check should be used on the return value. If the timeout has not been set
 *  and the thread has not previously executed, the entry at the thread's ID in
 *  the thread_ids array is set to 1 to indicate it was allowed to proceed.
 *
 *  @param thread_ids:  A pointer to the array with a size that matches the max number
 *                      of threads that will be spawned
 *   
 *  @param time_out:  Memory mapped variable used by the host to signal the kernel when
 *                    execution should suspend
 *
 *  @return:  A boolean value indicating whether the current thread should continue or not
 */
__device__ bool continue_execution(unsigned int *thread_ids, volatile unsigned int *time_out)
{
    if(*time_out == 1){ 
      return false;
    }

    int tid = get_global_idx(); 

    if(thread_ids[tid] == 1)
    {
      return false;
    }
    thread_ids[tid] = 1;

    return true;
}

__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, unsigned int *thread_ids, volatile unsigned int *timeout)
{
    if(!continue_execution(thread_ids, timeout))
    {
       return;
    }

    int i = blockDim.x * blockIdx.x + threadIdx.x;


    if (i < numElements)
    {
       /* C[i] = A[i] + B[i]; */
       C[i] = C[i] + B[i]; //Modified from above
    }
}
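
For reference, here is a condensed sketch of the host side of this scheme (the full listing appears in the edit below); total_threads, quantum, and the all_threads_complete() check are illustrative assumptions:

/* Host-side sketch (condensed). The mapped, pinned timeout flag is
 * visible to both host and device; d_thread_ids holds one slot per
 * thread, zeroed before the first launch. */
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(unsigned int), cudaHostAllocMapped);
*timeout = 0;

unsigned int *d_thread_ids = NULL;
cudaMalloc(&d_thread_ids, total_threads * sizeof(unsigned int));
cudaMemset(d_thread_ids, 0, total_threads * sizeof(unsigned int));

while(!all_threads_complete())  /* hypothetical completion check */
{
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, d_thread_ids, timeout);
    usleep(quantum);          /* let the kernel run for one time slice    */
    *timeout = 1;             /* signal resident threads to wind down     */
    cudaDeviceSynchronize();  /* wait for the kernel to exit              */
    *timeout = 0;             /* reset the flag before the next relaunch  */
}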

I considered how this could fail if __syncthreads() were used, so I decided to suspend at the block level instead. Based on my understanding, I thought this would be simple: keep track of whether a block has started, count how many of its threads have executed, suspend only once all threads of an already-started block have completed, and refuse execution to any thread whose block has not started. So I used a struct and modified the continue_execution function as follows:

typedef struct block_info_t{
  int started; /* Initialized to zero before any kernel launch */
  unsigned int thread_count;
}block_info;

__device__ bool continue_execution(unsigned int *thread_ids, volatile unsigned int *time_out, block_info *b_info)
{
    int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
    unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;

    if(*time_out == 1 && b_info[bid].started == 0)
    { 
      return false;
    }

    if(b_info[bid].thread_count == bsize)
    {
      return false;
    }

    b_info[bid].started = 1;
    atomicInc(&b_info[bid].thread_count, bsize); 

    return true;
}

This does not work. When I perform the verification on the host (h_B[i] - h_C[i]), I do not get a consistent result of zero, which means some threads somehow managed to execute more than once. Any ideas how/why this happens with this latter attempt? Thanks.

I am not concerned with performance at this point; I just want to understand what is really happening.

EDIT

Below is the complete code. Compile with nvcc file_name.cu and run as program_name <vector-length>:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef struct block_info_t{
  int started; /* Initialized to zero before any kernel launch */
  unsigned int thread_count;
}block_info;

__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
    int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
    unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;

    if(*time_out == 1 && b_info[bid].started == 0)
    { 
      return false;
    }

    if(b_info[bid].thread_count == bsize)
    {
      return false;
    }

    b_info[bid].started = 1;
    atomicInc(&b_info[bid].thread_count, bsize); 

    return true;
}

__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
    if(!continue_execution(time_out, b_info))
    { 
      return;
    }

    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
       //C[i] = A[i] + B[i];
       C[i] = C[i] + B[i]; //Modified from above
    }
}

void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
  size_t i;
  for(i = 0; i < block_amt; i++)
  {
    if(h_block_info[i].started == 1)
    {
      continue;
    }
    break;
  }
  *complete = (i == block_amt) ? 1 : 0;
}

int main(int argc, char *argv[])
{
    if(argc != 2)
    {
      fprintf(stderr, "usage: <program-name> <vector-length>\n");
      exit(EXIT_FAILURE);
    }

    // Print the vector length to be used, and compute its size
    long numElements = strtol(argv[1], NULL, 10);
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
        h_C[i] = 0.0;
    }

    float *d_A = NULL;
    cudaMalloc((void **)&d_A, size);

    float *d_B = NULL;
    cudaMalloc((void **)&d_B, size);

    float *d_C = NULL;
    cudaMalloc((void **)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;

    size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
    block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);

    for(int i = 0; i < blocksPerGrid; i++)
    {
      h_block_info[i].started = 0;
      h_block_info[i].thread_count = 0;
    }

    block_info *d_block_info = NULL;
    cudaMalloc(&d_block_info, block_info_bytes);
    cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);

    volatile unsigned int *timeout = NULL;
    cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
    *timeout = 0;

    double quantum = 0.0001 * 1000000.0;
    double initial_quantum = quantum;

    int complete = 0;

    /* Here the kernel launch is looped until all blocks are complete */
    while(complete == 0)
    {
      vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
      usleep(quantum);
      *timeout = 1;
      cudaDeviceSynchronize();

      cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost); 
      computation_complete(&complete, blocksPerGrid, h_block_info);

      if(complete == 0)
      {
        quantum = quantum + initial_quantum;
        *timeout = 0;
      }
    }

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device global memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    cudaDeviceReset();
    return 0;
}

1 Answer:

Answer 0 (score: 2):

There is a race condition in your continue_execution routine. Consider the following scenario:

  1. warp 0 of a threadblock enters the continue_execution routine. At the moment it checks the variables *time_out and b_info[bid].started, it witnesses those to be 0 and 0 respectively, so it proceeds to the next if-test.
  2. warp 1 of the same threadblock enters the continue_execution routine slightly later and witnesses the variables to be 1 and 0 respectively, so it returns false and the threads of warp 1 exit.
  3. warp 0 continues on and eventually sets b_info[bid].started to 1, then updates thread_count. It then returns true and proceeds with the vector add.

I could continue with this, but I think if you consider the above three items carefully you will realize it is a case you had not considered. Your implicit expectation is that every thread would read a coherent (i.e. the same for a given threadblock) value of *time_out. But your code does not guarantee that, and if it fails to hold, we end up with threadblocks in which some threads have completed their work and some have not. Because b_info records only a count of executed threads, not which threads executed, such a partially completed block lets threads that already did their addition pass the checks again on a relaunch, so those elements receive B[i] more than once.

    So how can we fix this? The description above points the way. One possible approach is to guarantee that, for any given threadblock, every thread reads the same value of *time_out, whether that value is 1 or 0. One possible solution is to make the following change to the beginning of the vectorAdd kernel:

    __shared__ volatile unsigned int my_time_out;
    if (!threadIdx.x) my_time_out = *time_out;
    __syncthreads();
    if(!continue_execution(&my_time_out, b_info))
    

    With these changes, we ensure that every thread in a block gets a coherent view of the timeout variable, and according to my testing the problem is resolved:

    $ cat t100.cu
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    
    // For the CUDA runtime routines (prefixed with "cuda_")
    #include <cuda_runtime.h>
    
    typedef struct block_info_t{
      int started; /* Initialized to zero before any kernel launch */
      unsigned int thread_count;
    }block_info;
    
    __device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
    {
        int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
        unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
    
        if(*time_out == 1 && b_info[bid].started == 0)
        {
          return false;
        }
    
        if(b_info[bid].thread_count == bsize)
        {
          return false;
        }
    
        b_info[bid].started = 1;
        atomicInc(&b_info[bid].thread_count, bsize);
    
        return true;
    }
    
    __global__ void
    vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
    {
    #ifdef USE_FIX
        __shared__ volatile unsigned int my_time_out;
        if (!threadIdx.x) my_time_out = *time_out;
        __syncthreads();
        if(!continue_execution(&my_time_out, b_info))
    #else
        if(!continue_execution(time_out, b_info))
    #endif
        {
          return;
        }
    
        int i = blockDim.x * blockIdx.x + threadIdx.x;
    
        if (i < numElements)
        {
           //C[i] = A[i] + B[i];
           C[i] = C[i] + B[i]; //Modified from above
        }
    }
    
    void computation_complete(int *complete, int block_amt, block_info *h_block_info)
    {
      size_t i;
      for(i = 0; i < block_amt; i++)
      {
        if(h_block_info[i].started == 1)
        {
          continue;
        }
        break;
      }
      *complete = (i == block_amt) ? 1 : 0;
    }
    
    int main(int argc, char *argv[])
    {
        if(argc != 2)
        {
          fprintf(stderr, "usage: <program-name> <vector-length>\n");
          exit(EXIT_FAILURE);
        }
    
        // Print the vector length to be used, and compute its size
        long numElements = strtol(argv[1], NULL, 10);
        size_t size = numElements * sizeof(float);
        printf("[Vector addition of %ld elements]\n", numElements);
    
        float *h_A = (float *)malloc(size);
        float *h_B = (float *)malloc(size);
        float *h_C = (float *)malloc(size);
    
        // Initialize the host input vectors
        for (int i = 0; i < numElements; ++i)
        {
            h_A[i] = rand()/(float)RAND_MAX;
            h_B[i] = rand()/(float)RAND_MAX;
            h_C[i] = 0.0;
        }
    
        float *d_A = NULL;
        cudaMalloc((void **)&d_A, size);
    
        float *d_B = NULL;
        cudaMalloc((void **)&d_B, size);
    
        float *d_C = NULL;
        cudaMalloc((void **)&d_C, size);
    
        cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
    
        int threadsPerBlock = 256;
        int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    
        size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
        block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
    
        for(int i = 0; i < blocksPerGrid; i++)
        {
          h_block_info[i].started = 0;
          h_block_info[i].thread_count = 0;
        }
    
        block_info *d_block_info = NULL;
        cudaMalloc(&d_block_info, block_info_bytes);
        cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
    
        volatile unsigned int *timeout = NULL;
        cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
        *timeout = 0;
    
        double quantum = 0.0001 * 1000000.0;
        double initial_quantum = quantum;
    
        int complete = 0;
    
        /* Here the kernel launch is looped until all blocks are complete */
        while(complete == 0)
        {
          vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
          usleep(quantum);
          *timeout = 1;
          cudaDeviceSynchronize();
    
          cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
          computation_complete(&complete, blocksPerGrid, h_block_info);
    
          if(complete == 0)
          {
            quantum = quantum + initial_quantum;
            *timeout = 0;
          }
        }
    
        cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    
        // Verify that the result vector is correct
        for (int i = 0; i < numElements; ++i)
        {
            if (fabs(h_B[i] - h_C[i]) > 1e-5)
            {
                fprintf(stderr, "Result verification failed at element %d!\n", i);
                exit(EXIT_FAILURE);
            }
        }
    
        printf("Test PASSED\n");
    
        // Free device global memory
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
    
        free(h_A);
        free(h_B);
        free(h_C);
    
        cudaDeviceReset();
        return 0;
    }
    $ nvcc -arch=sm_61 -o t100 t100.cu
    $ ./t100 327678
    [Vector addition of 327678 elements]
    Result verification failed at element 0!
    $ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX
    $ ./t100 327678
    [Vector addition of 327678 elements]
    Test PASSED
    $ ./t100 327678
    [Vector addition of 327678 elements]
    Test PASSED
    $ ./t100 327678
    [Vector addition of 327678 elements]
    Test PASSED
    $
    

    The other change I made to your code is this line:

    printf("[Vector addition of %d elements]\n", numElements);
    

    This has nothing to do with the problem, but your format specifier did not match your variable type; the fix is to change it to %ld.
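
    The corrected line, as it appears in the listing above, is:

    printf("[Vector addition of %ld elements]\n", numElements);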