Is it possible in CUDA to launch a kernel in stream A between two events in stream B?

Date: 2015-08-15 21:45:52

Tags: cuda

I have two streams A and B in different contexts (different devices, different host threads, automatically created contexts). I need to synchronize the execution of a kernel K in stream A with two different events in stream B, so that K starts after event 1 has fired, but not after event 2. Is this possible in principle (using cudaStreamWaitEvent)?

Stream B contains a loop, so I get a sequence of events like 1, 2, 1, 2, 1, 2, and so on. But my kernel K, in its separate stream, should start only between a 1 and the following 2, never after a 2 and before the next 1.

Example:

Host thread X:
Loop:
  kernel1(userStreamA)
  ...
  kernelK(userStreamA) <-- this should start only between E1 and E2
  ...
  kernelN(userStreamA)

Host thread Y: 
Loop:
  kernel1(userStreamB)
  record(E1)
  kernel2(userStreamB)
  some other kernels in userStreamB
  record(E2)
  kernel3(userStreamB)
  ...
  kernelN(userStreamB)

2 Answers:

Answer 0 (score: 1)

As far as I know, CUDA host-side calls are probably not the best choice for synchronizing threads/processes in the way you need (that is, with such fine-grained control over specific host-side situations). If your problem is strictly limited to using CUDA calls, you can disregard the alternative solution given below.

You can achieve what you want fairly easily with OpenMP or MPI, which give you high-level tools to control CPU-side synchronization and execution flow across threads and processes without interfering with your GPU kernels.

Hint: consider OpenMP synchronization barriers, or MPI wait/send/receive messages.
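
To make the hint concrete, here is a minimal sketch of the OpenMP-barrier idea (my illustration, not code from the answer): dummy_kern and the stream names are placeholders, and both streams are placed on a single device for brevity. The two barriers bound the host-side window in which thread 1 may issue kernel K; note that this constrains when the launch call is made, not when the kernel actually begins executing on the GPU. Compile with something like nvcc -Xcompiler -fopenmp omp_sketch.cu -o omp_sketch.

#include <omp.h>
#include <cstdio>

__global__ void dummy_kern(int tag){
    // placeholder for kernel1/kernel2/kernelK/etc.
    if (threadIdx.x == 0) printf("kernel %d\n", tag);
}

int main(){
    cudaStream_t streamA, streamB;
    cudaStreamCreate(&streamA);   // thread 1's stream (thread X / userStreamA)
    cudaStreamCreate(&streamB);   // thread 0's stream (thread Y / userStreamB)
    #pragma omp parallel num_threads(2)
    {
        int tid = omp_get_thread_num();
        for (int i = 0; i < 3; i++){
            if (tid == 0) dummy_kern<<<1,1,0,streamB>>>(1); // kernel1, before "E1"
            #pragma omp barrier   // plays the role of E1
            if (tid == 0) dummy_kern<<<1,1,0,streamB>>>(2); // kernel2
            else          dummy_kern<<<1,1,0,streamA>>>(0); // kernel K, issued only inside the window
            #pragma omp barrier   // plays the role of E2
            if (tid == 0) dummy_kern<<<1,1,0,streamB>>>(3); // kernel3, after "E2"
        }
    }
    cudaDeviceSynchronize();
    return 0;
}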

Answer 1 (score: 1)

The basic problem I see (and probably the one you ran into in your previous question) is that cudaStreamWaitEvent is a no-op if the event in question has not yet been "recorded".

So, to work around this, I suggest adding a shared flag or semaphore between the two host threads, which guarantees that the cudaStreamWaitEvent in stream A is not issued before the corresponding cudaEventRecord in stream B, and vice versa (in which case the same flag can be reused).

The following code implements this with pthreads:

$ cat t884.cu
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>

#define N_LOOPS 3
#define MAX_EVENT  4
#define MAX_STREAM 4
#define PTHREADS 2

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

#define DELAY_T 1000000000ULL
template <int type>
__global__ void delay_kern(int i){

  unsigned long long time = clock64();
#ifdef DEBUG
  printf("hello %d\n", type);
#endif
  while (clock64() < time+(i*DELAY_T));
}

// shared handshake flag between the two host threads; volatile so the
// compiler cannot optimize away the busy-wait loops in thread_func
static volatile int flag;

// The thread configuration structure.
typedef struct
{
    int device;
    int my_thread_ordinal;
    pthread_t thread;
    cudaError_t status;
    cudaStream_t *streams[MAX_STREAM];
    cudaEvent_t  *events[MAX_EVENT];
}
config_t;


// The function executed by each thread assigned with CUDA device.
void *thread_func(void *arg)
{
    // Unpack the config structure.
    config_t *config = (config_t *)arg;

    int device = config->device;
    int my_thread=config->my_thread_ordinal;
    cudaError_t cuda_status = cudaSuccess;
    cuda_status = cudaSetDevice(device);
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot set focus to device %d, status = %d\n",
            device, cuda_status);
        config->status = cuda_status;
        pthread_exit(NULL);
    }


    printf("thread %d initialized\n", my_thread);

    switch(config->my_thread_ordinal){
      case 0:
        //master thread - thread Y
        for (int i = 0; i < N_LOOPS; i++){
          delay_kern<0><<<1,1,0,*(config->streams[0])>>>(1);
          cudaEventRecord(*(config->events[0]), *(config->streams[0]));
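          // E1 has now been recorded; raise the flag so thread X knows its wait on E1 will not be a no-op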
          flag = 1;
          delay_kern<1><<<1,1,0,*(config->streams[0])>>>(1);
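          // spin until thread X has recorded E3 (events[2]); issuing the wait any earlier would make it a no-op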
          while (flag == 1){};
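          // record E2 only after E3 completes, i.e. not before kernel K has finished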
          cudaStreamWaitEvent(*(config->streams[0]), *(config->events[2]),0);
          cudaEventRecord(*(config->events[1]), *(config->streams[0]));
          delay_kern<2><<<1,1,0,*(config->streams[0])>>>(1);
          }
        break;
      default:
        //slave thread - thread X
        for (int i = 0; i < N_LOOPS; i++){
          delay_kern<3><<<1,1,0,*(config->streams[1])>>>(1);
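          // spin until thread Y has recorded E1; only then is it safe to issue the wait on it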
          while (flag == 0){};
          cudaStreamWaitEvent(*(config->streams[1]), *(config->events[0]),0);
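          // delay_kern<4> plays the role of kernel K: it runs only after E1 has completed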
          delay_kern<4><<<1,1,0,*(config->streams[1])>>>(1);
          cudaEventRecord(*(config->events[2]), *(config->streams[1]));
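          // E3 has now been recorded; lower the flag so thread Y can safely wait on it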
          flag = 0;
          delay_kern<5><<<1,1,0,*(config->streams[1])>>>(1);
          }
        break;
    }
    cudaDeviceSynchronize();
    cudaCheckErrors("thread CUDA error");
    printf("thread %d complete\n", my_thread);
    config->status = cudaSuccess;
    return NULL;
}

int main(int argc, char* argv[])
{
    flag = 0;
    const int nthreads = PTHREADS;

    // Create workers configs. Its data will be passed as
    // argument to thread_func.
    config_t* configs = (config_t*)malloc(sizeof(config_t) * nthreads);
    cudaStream_t s[MAX_STREAM];
    cudaEvent_t  e[MAX_EVENT];
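    // device 0 (thread Y) gets stream s[0] and events E1,E2 (e[0],e[1]);
    // device 1 (thread X) gets stream s[1] and event E3 (e[2])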
    cudaSetDevice(0);
    cudaStreamCreate(s+0);
    cudaEventCreate(e+0);
    cudaEventCreate(e+1);
    cudaSetDevice(1);
    cudaStreamCreate(s+1);
    cudaEventCreate(e+2);
    // create a separate thread
    // and execute the thread_func.
    for (int i = 0; i < nthreads; i++) {
        config_t *config = configs + i;
        config->device = i;
        config->my_thread_ordinal = i;
        for (int j = 0; j < PTHREADS; j++) config->streams[j] = s+j;
        for (int j = 0; j < PTHREADS+1; j++) config->events[j] = e+j;
        int status = pthread_create(&config->thread, NULL, thread_func, config);
        if (status) {
            fprintf(stderr, "Cannot create thread for device %d, status = %d\n",
                i, status);
        }
    }
    // Wait for device threads completion.
    // Check error status.
    int status = 0;
    for (int i = 0; i < nthreads; i++) {
        pthread_join(configs[i].thread, NULL);
        status += configs[i].status;
    }
    if (status)
        return status;

    free(configs);

    return 0;
}
$ nvcc -o t884 t884.cu -lpthread
$ time ./t884
thread 1 initialized
thread 0 initialized
thread 1 complete
thread 0 complete

real    0m9.738s
user    0m12.102s
sys     0m6.235s
$

The kernels are templated so that we can more easily tell things apart in the profiler. Each kernel is designed simply to burn ~1s of delay (this will be evident in the profiler output below). Since a total of 9 kernels are launched on each device, the fact that the overall execution time is ~9s already gives a rough indication of good overlap/concurrency.

I have implemented the "third" event (E3) "the other way around", to make sure that the recording of event E2 in stream B cannot occur before kernel K has completed. But to witness the specific synchronization you are looking for, between your kernel K in stream A and the events recorded in stream B, we need to look at the profiler:

$ nvprof --print-gpu-trace ./t884
==14914== NVPROF is profiling process 14914, command: ./t884
thread 0 initialized
thread 1 initialized
thread 1 complete
thread 0 complete
==14914== Profiling application: ./t884
==14914== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
887.60ms  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=0>(int) [188]
887.64ms  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=3>(int) [192]
1.86225s  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=1>(int) [195]
1.86225s  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=4>(int) [199]
2.81905s  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=5>(int) [204]
2.83690s  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=2>(int) [208]
3.77584s  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=3>(int) [212]
3.81155s  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=0>(int) [214]
4.78619s  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=1>(int) [219]
4.78620s  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=4>(int) [222]
5.74300s  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=5>(int) [227]
5.76084s  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=2>(int) [231]
6.69979s  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=3>(int) [235]
6.73549s  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=0>(int) [237]
7.71014s  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=1>(int) [242]
7.71015s  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=4>(int) [245]
8.66694s  956.79ms              (1 1 1)         (1 1 1)         8        0B        0B         -           -  GeForce GT 640          2        21  void delay_kern<int=5>(int) [250]
8.68479s  974.65ms              (1 1 1)         (1 1 1)         7        0B        0B         -           -  Quadro 5000 (0)         1        13  void delay_kern<int=2>(int) [254]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$ 

What we see, in all 3 iterations of the loop, is that delay_kern<int=1> on one device and delay_kern<int=4> on the other (the latter being, in effect, your kernel K) have nearly identical start times. This gives me a high degree of confidence that the event handling is enforcing the desired behavior.

The thread synchronization imposed by this proposal has the drawback that we can no longer queue up long stretches of asynchronous activity from the host threads (although we still get concurrent processing on the devices, with the desired stream synchronization). But I really don't see a way to enforce the behavior you want without some element of host thread synchronization.