Question

以下情况中cudaStreamSynchronize的行为是什么

ThreadA pseudo code :
    while(true):
         submit new cuda Kernel to cudaStreamX

ThreadB pseudo code:
     call cudaStreamSynchronize(cudaStreamX)

我的问题是ThreadB何时返回？由于ThreadA将始终推送新的cuda内核，而cudaStreamX永远不会完成。

Answer 1

API documentation没有直接明确这一点，但CUDA C programming guide基本上是明确的：

cudaStreamSynchronize()将流作为参数并等待，直到给定流中的所有前面的命令完成

此外，我认为以下两点是合乎情理的：

cudaStreamSynchronize()无法合理地把在该cudaStreamSynchronize()调用之后才提交到流中的工作考虑在内。否则就或多或少要求它能够预知未来。
可以合理地预期，cudaStreamSynchronize()会在所有先前已提交的工作完成之后返回。

我编写了一个实验性的测试程序，观察到的行为与上面的描述一致：

$ cat t396.cu
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>

// Number of host threads that main() creates (one master, one slave).
const int PTHREADS=2;
// Loop-count threshold: the master thread raises the sync trigger (`flag`)
// once its loop counter has exceeded this value.
const int TRIGGER1=5;

// Checks the CUDA runtime's last-error state for the calling host thread.
// If an error is pending, prints `msg`, the CUDA error string and the
// file/line of the macro invocation to stderr, then terminates the process
// with exit code 1. Does nothing when no error is pending.
// Note that cudaGetLastError() also clears the pending error.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t last_cuda_err = cudaGetLastError(); \
        if (last_cuda_err == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
            msg, cudaGetErrorString(last_cuda_err), \
            __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)

#include <time.h>
#include <sys/time.h>
// Microseconds per second; scales the seconds field of a timeval.
#define USECPSEC 1000000ULL

// Returns the current gettimeofday() time expressed in microseconds, minus
// `start`. Passing 0 yields an absolute timestamp; passing an earlier
// timestamp yields the interval elapsed since it was taken.
long long dtime_usec(unsigned long long start){
  struct timeval now;
  gettimeofday(&now, 0);
  // Same unsigned arithmetic as a single expression, split into named steps.
  const unsigned long long seconds_as_usec = now.tv_sec*USECPSEC;
  const unsigned long long stamp_usec = seconds_as_usec + now.tv_usec;
  return stamp_usec - start;
}

// Number of clock64() ticks that make up one delay unit.
#define DELAY_T 1000000000ULL

// Busy-wait kernel: spins until the value returned by clock64() reaches the
// value sampled at entry plus i * DELAY_T ticks.
//   type - compile-time tag; only used in the DEBUG printout.
//   i    - number of delay units to wait. It is converted to an unsigned
//          64-bit value for the multiplication, so pass a non-negative count.
// The kernel does no thread indexing; every launched thread spins on its own.
template <int type>
__global__ void delay_kern(int i){

  const unsigned long long start_tick = clock64();
#ifdef DEBUG
  printf("hello %d\n", type);
#endif
  // The deadline is loop-invariant, so compute it once up front.
  const unsigned long long deadline = start_tick+(i*DELAY_T);
  while (clock64() < deadline) { }
}

// Signalling state shared between the two host threads:
//   flag     - set by the master thread to tell the slave thread to issue
//              its stream synchronize.
//   flag0    - the master thread keeps launching kernels while this is
//              nonzero; the slave thread clears it after its sync returns.
//   loop_cnt - incremented once per master-loop iteration.
// NOTE(review): volatile only prevents the compiler from caching these in
// registers; it does not make loop_cnt++ atomic or order accesses between
// threads. That is tolerated by this demo but is not a general-purpose
// synchronization mechanism.
volatile static int flag, flag0, loop_cnt;

// Per-thread configuration record. main() fills one in for every host thread
// and passes its address to thread_func as the pthread argument.
typedef struct {
    int my_thread_ordinal;  // index of the thread; 0 selects the master role
    pthread_t thread;       // handle written by pthread_create
    cudaError_t status;     // result reported back by the thread
    cudaStream_t stream;    // CUDA stream the thread operates on
    int delay_usec;         // host sleep between master-loop iterations, in microseconds
} config_t;


// Entry point executed by each host thread.
//
// `arg` points to the thread's config_t. The thread whose ordinal is 0 acts
// as the master: while flag0 is nonzero it keeps launching delay_kern into
// config->stream, raises `flag` once loop_cnt has exceeded TRIGGER1, and
// sleeps config->delay_usec microseconds between launches. Every other thread
// acts as the slave: it spins until `flag` is raised, calls
// cudaStreamSynchronize() on the same stream, and then clears flag0 so the
// master stops.
//
// On return config->status holds cudaSuccess, or the cudaSetDevice error if
// device selection failed. Any other pending CUDA error terminates the
// process via cudaCheckErrors. Always returns NULL.
void *thread_func(void *arg)
{
    // Unpack the config structure.
    config_t *config = (config_t *)arg;
    int my_thread=config->my_thread_ordinal;
    cudaError_t cuda_status = cudaSetDevice(0);
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot set focus to device %d, status = %d\n",
            0, cuda_status);
        config->status = cuda_status;
        // Release the peer before leaving: without this a failed master would
        // leave the slave spinning on `flag` forever, and a failed slave would
        // leave the master looping on `flag0` forever, hanging pthread_join.
        flag = 1;
        flag0 = 0;
        pthread_exit(NULL);
    }


    printf("thread %d initialized\n", my_thread);

    switch(config->my_thread_ordinal){
      case 0:
        //master thread
        while (flag0) {
          delay_kern<0><<<1,1,0,config->stream>>>(1);
          // The comparison uses the value before the increment, so the
          // trigger is raised on the iteration that sees TRIGGER1+1.
          if (loop_cnt++ > TRIGGER1)  flag = 1;
          printf("master thread loop: %d\n", loop_cnt);
          usleep(config->delay_usec);
          }
        break;
      default:
        //slave thread
        // Busy-wait until the master has queued enough kernels.
        while (!flag);
        printf("slave thread issuing stream sync at loop count: %d\n", loop_cnt);
        // The behavior under test: this call is issued while the master is
        // still submitting work to the same stream. A failure here is picked
        // up by cudaCheckErrors below.
        cudaStreamSynchronize(config->stream);
        flag0 = 0;
        printf("slave thread set trigger and exit\n");
        break;
    }
    // Catches launch errors (master) or a synchronize error (slave).
    cudaCheckErrors("thread CUDA error");
    printf("thread %d complete\n", my_thread);
    config->status = cudaSuccess;
    return NULL;
}

// Program entry point.
//
// Usage: t396 [delay_usec]
//   delay_usec - host-side delay between the master thread's kernel launches,
//                in microseconds; must be in [1, 10000000]. Defaults to 1.
//
// Creates one CUDA stream shared by PTHREADS host threads, runs thread_func
// in each of them and waits for them to finish.
//
// Returns -1 for an invalid argument, an allocation failure or a thread
// creation failure; otherwise the sum of the per-thread CUDA status codes
// (0 on success). A failure of cudaSetDevice or cudaStreamCreate terminates
// the process through cudaCheckErrors.
int main(int argc, char* argv[])
{
    int mydelay_usec = 1;
    if (argc > 1) mydelay_usec = atoi(argv[1]);
    if ((mydelay_usec < 1) || (mydelay_usec > 10000000)) {printf("invalid delay time specified\n"); return -1;}
    flag = 0; flag0 = 1; loop_cnt = 0;
    const int nthreads = PTHREADS;

    // Create workers configs. Its data will be passed as
    // argument to thread_func.
    config_t* configs = (config_t*)malloc(sizeof(config_t) * nthreads);
    if (configs == NULL) {
        fprintf(stderr, "Cannot allocate thread configs\n");
        return -1;
    }
    cudaSetDevice(0);
    cudaCheckErrors("cudaSetDevice failed");
    cudaStream_t str;
    cudaStreamCreate(&str);
    cudaCheckErrors("cudaStreamCreate failed");

    // Remember which threads really exist so that only those are joined.
    int created[PTHREADS];
    int create_failures = 0;

    // create a separate thread
    // and execute the thread_func.
    for (int i = 0; i < nthreads; i++) {
        config_t *config = configs + i;
        config->my_thread_ordinal = i;
        config->stream = str;
        config->delay_usec = mydelay_usec;
        // Give status a defined value even if the thread never runs.
        config->status = cudaSuccess;
        int rc = pthread_create(&config->thread, NULL, thread_func, config);
        created[i] = (rc == 0);
        if (rc) {
            fprintf(stderr, "Cannot create thread for device %d, status = %d\n",
                i, rc);
            create_failures++;
            // The two roles wait on each other through these flags; release
            // whichever thread does exist so that the join below cannot hang.
            flag = 1;
            flag0 = 0;
        }
    }
    // Wait for device threads completion.
    // Check error status.
    int status = 0;
    for (int i = 0; i < nthreads; i++) {
        if (!created[i]) continue;
        pthread_join(configs[i].thread, NULL);
        status += configs[i].status;
    }

    // Release resources on every exit path. Per the CUDA runtime
    // documentation cudaStreamDestroy returns immediately when work is still
    // queued, so this does not wait for kernels the master left in flight.
    cudaError_t destroy_status = cudaStreamDestroy(str);
    if (destroy_status != cudaSuccess) {
        fprintf(stderr, "Warning: cudaStreamDestroy failed: %s\n",
            cudaGetErrorString(destroy_status));
    }
    free(configs);

    if (status)
        return status;

    return create_failures ? -1 : 0;
}
$ nvcc -arch=sm_61 -o t396 t396.cu -lpthread
$ time ./t396 100000
thread 0 initialized
thread 1 initialized
master thread loop: 1
master thread loop: 2
master thread loop: 3
master thread loop: 4
master thread loop: 5
master thread loop: 6
slave thread issuing stream sync at loop count: 7
master thread loop: 7
master thread loop: 8
master thread loop: 9
master thread loop: 10
master thread loop: 11
master thread loop: 12
master thread loop: 13
master thread loop: 14
master thread loop: 15
master thread loop: 16
master thread loop: 17
master thread loop: 18
master thread loop: 19
master thread loop: 20
master thread loop: 21
master thread loop: 22
master thread loop: 23
master thread loop: 24
master thread loop: 25
master thread loop: 26
master thread loop: 27
master thread loop: 28
master thread loop: 29
master thread loop: 30
master thread loop: 31
master thread loop: 32
master thread loop: 33
master thread loop: 34
master thread loop: 35
master thread loop: 36
master thread loop: 37
master thread loop: 38
master thread loop: 39
slave thread set trigger and exit
thread 1 complete
thread 0 complete

real    0m5.416s
user    0m2.990s
sys     0m1.623s
$

这需要仔细思考才能理解。简而言之，应用程序会在一个线程中不断发出内核，每个内核只是执行约0.7秒的延迟然后返回；另一个线程则等到已经发出少量内核之后，再发出cudaStreamSynchronize()调用。应用程序的总体运行时间反映了该调用是在何时返回的。只要您将命令行参数（主机端两次内核启动之间的延迟）保持在小于约0.5秒的值，应用程序就会稳定地在大约5.4秒内退出（具体数值取决于您运行的GPU，但在主机延迟参数增大到相当大的值之前，整个应用程序的执行时间应该保持不变）。

如果指定的命令行参数大于您机器上的内核持续时间，则整个应用程序的执行时间将约为命令行参数（以微秒为单位）的5倍，因为cudaStreamSynchronize()调用的触发点是5。

就我而言，我编译并在CUDA 8.0.61，Ubuntu 14.04，Pascal Titan X上运行。

多线程下的cudaStreamSynchronize行为

1 个答案: