多线程下的cudaStreamSynchronize行为

时间:2017-08-15 01:11:58

标签: multithreading cuda

在以下情况下,cudaStreamSynchronize 的行为是什么?

ThreadA pseudo code :
    while(true):
         submit new cuda Kernel to cudaStreamX

ThreadB pseudo code:
     call cudaStreamSynchronize(cudaStreamX)

我的问题是:ThreadB 何时返回?既然 ThreadA 会一直不断地提交新的 CUDA 内核,cudaStreamX 中的工作似乎永远不会全部完成。

1 个答案:

答案 0 :(得分:1)

API 文档(API documentation)没有直接说明这一点,但《CUDA C Programming Guide》基本上已经说清楚了:

  

cudaStreamSynchronize()将流作为参数并等待,直到给定流中的所有前面的命令完成

此外,我认为以下几点是合乎情理的:

  1. cudaStreamSynchronize() 不可能合理地把该 cudaStreamSynchronize() 调用之后才提交到流中的工作考虑在内,否则就等于要求它能够预知未来。

  2. 可以合理地预期,cudaStreamSynchronize() 会在此前已提交的所有工作完成之后返回。

  3. 我编写了一个实验性的测试程序,观察到的行为与上面的描述一致:

    $ cat t396.cu
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <iostream>
    #include <unistd.h>
    
    // Number of host threads to spawn: ordinal 0 launches kernels, every other
    // ordinal waits for the trigger and then synchronizes the shared stream.
    const int PTHREADS = 2;
    // Launch-count threshold: the launching thread raises `flag` on the first
    // iteration where the pre-increment value of loop_cnt exceeds this value.
    const int TRIGGER1 = 5;
    
    // Aborts the process if the CUDA runtime reports a pending error.
    // Fetches the last error with cudaGetLastError() (which also clears it); on
    // anything other than cudaSuccess it prints `msg`, the CUDA error string and
    // the call site to stderr, then exits with status 1.
    #define cudaCheckErrors(msg) \
        do { \
            const cudaError_t check_status_ = cudaGetLastError(); \
            if (check_status_ == cudaSuccess) break; \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(check_status_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } while (0)
    
    #include <time.h>
    #include <sys/time.h>
    // Microseconds per second; used to scale the seconds field of a timeval.
    #define USECPSEC 1000000ULL

    // Returns the current gettimeofday() time in microseconds, minus `start`.
    // Pass 0 to obtain an absolute timestamp, or an earlier timestamp to obtain
    // the time elapsed since it.
    long long dtime_usec(unsigned long long start){
      timeval now;
      gettimeofday(&now, 0);
      const unsigned long long now_usec = (now.tv_sec * USECPSEC) + now.tv_usec;
      return now_usec - start;
    }
    
    // Multiplier applied to the kernel argument to obtain the spin length,
    // expressed in clock64() ticks.
    #define DELAY_T 1000000000ULL

    // Busy-wait kernel: each thread spins until clock64() reaches the value it
    // read on entry plus i * DELAY_T ticks, then returns.
    // The template parameter is only used to label the DEBUG printf; it does not
    // influence the delay. Elsewhere in this file the kernel is launched as
    // <<<1,1>>> with no dynamic shared memory on a caller-supplied stream.
    template <int type>
    __global__ void delay_kern(int i){

      const unsigned long long start = clock64();
    #ifdef DEBUG
      printf("hello %d\n", type);
    #endif
      const unsigned long long deadline = start + (i * DELAY_T);
      // Empty body on purpose: the loop exists only to burn device time.
      while (clock64() < deadline) { }
    }
    
    // Cross-thread signalling variables shared by the host threads in this file.
    // NOTE(review): volatile is the only mechanism used for these; it is not a
    // substitute for atomics or locks.

    // Set to 1 by the launching thread; the waiting thread spins until it is set.
    static volatile int flag;
    // The launching thread keeps looping while this is non-zero; the waiting
    // thread clears it after its stream synchronize has returned.
    static volatile int flag0;
    // Incremented by the launching thread once per kernel launch.
    static volatile int loop_cnt;
    
    // Per-thread configuration record; a pointer to one of these is handed to
    // thread_func through pthread_create.
    struct config_t
    {
        int my_thread_ordinal;  // index of the thread; selects its role
        pthread_t thread;       // handle filled in by pthread_create
        cudaError_t status;     // result the thread reports back to main
        cudaStream_t stream;    // CUDA stream the thread operates on
        int delay_usec;         // host-side sleep between kernel launches (usleep argument)
    };
    
    
    // Thread entry point. `arg` must point to the config_t of this thread.
    // Selects device 0, then acts according to the thread ordinal:
    //   - ordinal 0 ("master") launches delay_kern<0> into the configured stream
    //     over and over, sleeping delay_usec microseconds between launches, for
    //     as long as flag0 is non-zero; it raises `flag` once the pre-increment
    //     value of loop_cnt exceeds TRIGGER1.
    //   - any other ordinal ("slave") spins until `flag` is raised, calls
    //     cudaStreamSynchronize on the stream, then clears flag0.
    // Records the outcome in the config's status field and returns NULL.
    void *thread_func(void *arg)
    {
        config_t *cfg = (config_t *)arg;
        const int my_thread = cfg->my_thread_ordinal;

        const cudaError_t set_device_status = cudaSetDevice(0);
        if (set_device_status != cudaSuccess) {
            fprintf(stderr, "Cannot set focus to device %d, status = %d\n",
                0, set_device_status);
            cfg->status = set_device_status;
            pthread_exit(NULL);
        }

        printf("thread %d initialized\n", my_thread);

        if (my_thread == 0) {
            // master thread: keep feeding the stream until told to stop
            while (flag0) {
                delay_kern<0><<<1,1,0,cfg->stream>>>(1);
                const int launches_before = loop_cnt++;
                if (launches_before > TRIGGER1) flag = 1;
                printf("master thread loop: %d\n", loop_cnt);
                usleep(cfg->delay_usec);
            }
        } else {
            // slave thread: wait for the trigger, then synchronize the stream
            while (flag == 0) { }
            printf("slave thread issuing stream sync at loop count: %d\n", loop_cnt);
            cudaStreamSynchronize(cfg->stream);
            flag0 = 0;
            printf("slave thread set trigger and exit\n");
        }

        // Exits the process if any CUDA call made by this thread left an error.
        cudaCheckErrors("thread CUDA error");
        printf("thread %d complete\n", my_thread);
        cfg->status = cudaSuccess;
        return NULL;
    }
    
    // Program entry point.
    // Usage: t396 [delay_usec]
    //   delay_usec: host-side delay between kernel launches, in microseconds
    //               (accepted range 1..10000000, default 1).
    // Creates one stream on device 0, starts PTHREADS threads running
    // thread_func on that stream, joins them, and returns the sum of the
    // statuses they reported (0 when every thread succeeded). Returns -1 for an
    // out-of-range delay and exits with 1 on allocation, CUDA setup or thread
    // creation failure.
    int main(int argc, char* argv[])
    {
        int mydelay_usec = 1;
        if (argc > 1) mydelay_usec = atoi(argv[1]);
        if ((mydelay_usec < 1) || (mydelay_usec > 10000000)) {printf("invalid delay time specified\n"); return -1;}
        flag = 0; flag0 = 1; loop_cnt = 0;
        const int nthreads = PTHREADS;

        // Create workers configs. Its data will be passed as
        // argument to thread_func.
        config_t* configs = (config_t*)malloc(sizeof(config_t) * nthreads);
        if (configs == NULL) {
            fprintf(stderr, "Cannot allocate %d thread configs\n", nthreads);
            return 1;
        }

        // Set up the device and the single stream shared by all threads; abort
        // early instead of handing an invalid stream to the workers.
        cudaSetDevice(0);
        cudaCheckErrors("cudaSetDevice failed");
        cudaStream_t str;
        cudaStreamCreate(&str);
        cudaCheckErrors("cudaStreamCreate failed");

        // create a separate thread
        // and execute the thread_func.
        for (int i = 0; i < nthreads; i++) {
            config_t *config = configs + i;
            config->my_thread_ordinal = i;
            config->stream = str;
            config->delay_usec = mydelay_usec;
            // Give status a defined value before the thread gets a chance to set it.
            config->status = cudaSuccess;
            int status = pthread_create(&config->thread, NULL, thread_func, config);
            if (status) {
                fprintf(stderr, "Cannot create thread for device %d, status = %d\n",
                    i, status);
                // The two roles wait on each other through flag/flag0, so a
                // missing thread would leave the other one spinning forever, and
                // joining a handle that was never created is undefined.
                // configs is intentionally not freed here: an already running
                // thread may still be using its entry.
                exit(1);
            }
        }

        // Wait for device threads completion.
        // Check error status.
        int status = 0;
        for (int i = 0; i < nthreads; i++) {
            pthread_join(configs[i].thread, NULL);
            status += configs[i].status;
        }

        // NOTE(review): kernels queued after the slave's synchronize call may
        // still be pending at this point. No device-wide synchronize is added
        // here because it would change the overall run time of the program.
        free(configs);

        return status;
    }
    $ nvcc -arch=sm_61 -o t396 t396.cu -lpthread
    $ time ./t396 100000
    thread 0 initialized
    thread 1 initialized
    master thread loop: 1
    master thread loop: 2
    master thread loop: 3
    master thread loop: 4
    master thread loop: 5
    master thread loop: 6
    slave thread issuing stream sync at loop count: 7
    master thread loop: 7
    master thread loop: 8
    master thread loop: 9
    master thread loop: 10
    master thread loop: 11
    master thread loop: 12
    master thread loop: 13
    master thread loop: 14
    master thread loop: 15
    master thread loop: 16
    master thread loop: 17
    master thread loop: 18
    master thread loop: 19
    master thread loop: 20
    master thread loop: 21
    master thread loop: 22
    master thread loop: 23
    master thread loop: 24
    master thread loop: 25
    master thread loop: 26
    master thread loop: 27
    master thread loop: 28
    master thread loop: 29
    master thread loop: 30
    master thread loop: 31
    master thread loop: 32
    master thread loop: 33
    master thread loop: 34
    master thread loop: 35
    master thread loop: 36
    master thread loop: 37
    master thread loop: 38
    master thread loop: 39
    slave thread set trigger and exit
    thread 1 complete
    thread 0 complete
    
    real    0m5.416s
    user    0m2.990s
    sys     0m1.623s
    $
    

    这需要仔细思考才能理解。简而言之,该应用程序在一个线程中不断发出内核,每个内核只是执行约0.7秒的延迟然后返回;另一个线程则等到已经发出少量内核之后,再发出 cudaStreamSynchronize() 调用。应用程序的总体运行时间反映了该调用是何时返回的。只要把命令行参数(即两次内核启动之间的主机端延迟)保持在小于约0.5秒的值,应用程序就会稳定地在大约5.4秒内退出(具体数值取决于你所使用的GPU,但在主机延迟参数增大到相当大的值之前,应用程序的总执行时间应保持不变)。

    如果指定的命令行参数大于你机器上的内核持续时间,那么应用程序的总执行时间将约为命令行参数(单位为微秒)的5倍,因为 cudaStreamSynchronize() 调用的触发点是5。

    就我而言,我编译并在CUDA 8.0.61,Ubuntu 14.04,Pascal Titan X上运行。