以下情况中cudaStreamSynchronize的行为是什么
ThreadA pseudo code :
while(true):
submit new cuda Kernel to cudaStreamX
ThreadB pseudo code:
call cudaStreamSynchronize(cudaStreamX)
我的问题是:ThreadB何时返回?因为ThreadA会一直提交新的CUDA内核,cudaStreamX中的工作永远不会全部完成。
答案 0 :(得分:1)
API documentation没有直接明确这一点,但CUDA C programming guide基本上是明确的:
cudaStreamSynchronize()
将流作为参数并等待,直到给定流中的所有前面的命令完成
此外,我认为以下两点是合乎情理的:
cudaStreamSynchronize()
无法合理地把在该cudaStreamSynchronize()
调用之后才提交到流中的工作考虑在内,否则它就或多或少需要预知未来。
cudaStreamSynchronize()
可以合理地预期会在所有先前已提交的工作完成之后返回。
我编写了一个实验性的测试程序,观察到的行为与上面的描述一致:
$ cat t396.cu
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>
// Number of host threads main() creates; thread 0 launches kernels, the others synchronize.
const int PTHREADS=2;
// Loop-count threshold: once the launching thread's counter exceeds it, that thread sets `flag`.
const int TRIGGER1=5;
// Error-check helper: queries cudaGetLastError() and, if it is not
// cudaSuccess, prints `msg`, the CUDA error string and the source location
// to stderr, then terminates the process with exit(1).
// Wrapped in do { ... } while (0) so it behaves as a single statement.
#define cudaCheckErrors(msg)                                          \
    do {                                                              \
        const cudaError_t cuda_check_status_ = cudaGetLastError();    \
        if (cuda_check_status_ == cudaSuccess) break;                 \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",            \
                msg, cudaGetErrorString(cuda_check_status_),          \
                __FILE__, __LINE__);                                  \
        fprintf(stderr, "*** FAILED - ABORTING\n");                   \
        exit(1);                                                      \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// Returns the current gettimeofday() reading, converted to microseconds,
// minus `start`. Pass 0 to get the raw reading, or a previous reading to get
// the elapsed microseconds since it was taken.
long long dtime_usec(unsigned long long start){
  timeval now;
  gettimeofday(&now, 0);
  // All arithmetic is carried out in unsigned long long (USECPSEC is ULL).
  const unsigned long long seconds_as_usec = now.tv_sec * USECPSEC;
  const unsigned long long now_usec = seconds_as_usec + now.tv_usec;
  return now_usec - start;
}
#define DELAY_T 1000000000ULL
// Busy-wait kernel: every thread spins until clock64() reaches the value it
// read on entry plus i * DELAY_T ticks.
// `type` is a compile-time tag that is only used by the DEBUG printf.
// The launch in this file uses <<<1,1>>> on a caller-supplied stream.
template <int type>
__global__ void delay_kern(int i){
  const unsigned long long start = clock64();
#ifdef DEBUG
  printf("hello %d\n", type);
#endif
  const unsigned long long deadline = start + (i*DELAY_T);
  while (clock64() < deadline) {
    // spin until the deadline tick count is reached
  }
}
// Signalling state shared between the host threads:
//   flag     - set to 1 by the launching thread once loop_cnt has passed TRIGGER1;
//              the synchronizing thread spins until it becomes non-zero.
//   flag0    - the launching thread keeps looping while this is non-zero;
//              cleared by the synchronizing thread after its stream sync returns.
//   loop_cnt - incremented once per kernel launch by the launching thread.
// NOTE(review): `volatile` only stops the compiler from caching these values; it
// does not make the accesses atomic or ordered. Acceptable for this experiment,
// but not a general-purpose synchronization mechanism.
volatile static int flag, flag0, loop_cnt;
// Per-thread configuration record; main() fills one per thread and passes its
// address to thread_func() through pthread_create().
struct config_t
{
  int my_thread_ordinal;  // thread index; 0 selects the kernel-launching role
  pthread_t thread;       // handle filled in by pthread_create()
  cudaError_t status;     // result written back by thread_func()
  cudaStream_t stream;    // stream the thread operates on (main() hands out the same one to all)
  int delay_usec;         // host-side sleep between kernel launches, in microseconds
};
// Thread entry point; `arg` points to this thread's config_t.
//
// Thread 0 ("master") keeps launching delay_kern on config->stream, sleeping
// config->delay_usec microseconds between launches, for as long as flag0 is
// set. Once its loop counter has passed TRIGGER1 it raises `flag`.
// Every other thread ("slave") waits for `flag`, calls
// cudaStreamSynchronize() on the same stream while the master is still
// launching, and then clears flag0 so that the master stops.
//
// On return config->status is cudaSuccess, or the cudaSetDevice() error if
// device selection failed. Any other CUDA error terminates the process via
// cudaCheckErrors(). Always returns NULL.
void *thread_func(void *arg)
{
  // Unpack the config structure.
  config_t *config = (config_t *)arg;
  int my_thread=config->my_thread_ordinal;
  cudaError_t cuda_status = cudaSetDevice(0);
  if (cuda_status != cudaSuccess) {
    fprintf(stderr, "Cannot set focus to device %d, status = %d\n",
            0, cuda_status);
    config->status = cuda_status;
    // Release the peer thread before bailing out: otherwise the slave would
    // spin on `flag` forever (or the master would never see flag0 cleared)
    // and main() would block in pthread_join().
    flag = 1;
    flag0 = 0;
    return NULL;
  }
  printf("thread %d initialized\n", my_thread);
  switch(config->my_thread_ordinal){
    case 0:
      //master thread
      while (flag0) {
        delay_kern<0><<<1,1,0,config->stream>>>(1);
        // Kernel launches do not return a status; check right away so a
        // failed launch is reported instead of being discovered after the loop.
        cudaCheckErrors("kernel launch failed");
        if (loop_cnt++ > TRIGGER1) flag = 1;
        printf("master thread loop: %d\n", loop_cnt);
        usleep(config->delay_usec);
      }
      break;
    default:
      //slave thread
      while (!flag) {
        // spin until the master has issued enough kernels
      }
      printf("slave thread issuing stream sync at loop count: %d\n", loop_cnt);
      cudaStreamSynchronize(config->stream);
      // Tell the master to stop launching now that the sync has returned.
      flag0 = 0;
      printf("slave thread set trigger and exit\n");
      break;
  }
  cudaCheckErrors("thread CUDA error");
  printf("thread %d complete\n", my_thread);
  config->status = cudaSuccess;
  return NULL;
}
// Program entry point.
//
// Usage: t396 [delay_usec]
//   delay_usec - host-side sleep between kernel launches, in microseconds;
//                must be in 1..10000000 (default 1).
//
// Creates one stream on device 0, starts PTHREADS threads running
// thread_func() on that stream, waits for them and returns the sum of their
// status codes (0 on success). Returns -1 for an invalid delay argument or a
// failed allocation, and also when a thread could not be created and no
// thread reported a CUDA error. CUDA setup failures terminate the process via
// cudaCheckErrors().
int main(int argc, char* argv[])
{
  int mydelay_usec = 1;
  if (argc > 1) mydelay_usec = atoi(argv[1]);
  if ((mydelay_usec < 1) || (mydelay_usec > 10000000)) {printf("invalid delay time specified\n"); return -1;}
  flag = 0; flag0 = 1; loop_cnt = 0;
  const int nthreads = PTHREADS;
  // Create workers configs. Its data will be passed as
  // argument to thread_func.
  config_t* configs = (config_t*)malloc(sizeof(config_t) * nthreads);
  if (configs == NULL) {
    fprintf(stderr, "Cannot allocate thread configs\n");
    return -1;
  }
  cudaSetDevice(0);
  cudaCheckErrors("cudaSetDevice failed");
  cudaStream_t str;
  cudaStreamCreate(&str);
  cudaCheckErrors("cudaStreamCreate failed");
  // Remember which threads actually started so only those are joined.
  bool created[PTHREADS];
  bool create_failed = false;
  // create a separate thread
  // and execute the thread_func.
  for (int i = 0; i < nthreads; i++) {
    config_t *config = configs + i;
    config->my_thread_ordinal = i;
    config->status = cudaSuccess;  // defined value even if the thread never runs
    config->stream = str;
    config->delay_usec = mydelay_usec;
    int status = pthread_create(&config->thread, NULL, thread_func, config);
    created[i] = (status == 0);
    if (status) {
      fprintf(stderr, "Cannot create thread for device %d, status = %d\n",
              i, status);
      create_failed = true;
      // The missing thread can never signal its peer, so release the peer
      // here; otherwise the join below would block forever.
      flag = 1;
      flag0 = 0;
    }
  }
  // Wait for device threads completion.
  // Check error status.
  int status = 0;
  for (int i = 0; i < nthreads; i++) {
    if (!created[i]) continue;  // joining a never-created thread is undefined
    pthread_join(configs[i].thread, NULL);
    status += configs[i].status;
  }
  if (create_failed && status == 0) status = -1;
  // Release resources on every exit path, not only on success.
  cudaStreamDestroy(str);
  free(configs);
  return status;
}
$ nvcc -arch=sm_61 -o t396 t396.cu -lpthread
$ time ./t396 100000
thread 0 initialized
thread 1 initialized
master thread loop: 1
master thread loop: 2
master thread loop: 3
master thread loop: 4
master thread loop: 5
master thread loop: 6
slave thread issuing stream sync at loop count: 7
master thread loop: 7
master thread loop: 8
master thread loop: 9
master thread loop: 10
master thread loop: 11
master thread loop: 12
master thread loop: 13
master thread loop: 14
master thread loop: 15
master thread loop: 16
master thread loop: 17
master thread loop: 18
master thread loop: 19
master thread loop: 20
master thread loop: 21
master thread loop: 22
master thread loop: 23
master thread loop: 24
master thread loop: 25
master thread loop: 26
master thread loop: 27
master thread loop: 28
master thread loop: 29
master thread loop: 30
master thread loop: 31
master thread loop: 32
master thread loop: 33
master thread loop: 34
master thread loop: 35
master thread loop: 36
master thread loop: 37
master thread loop: 38
master thread loop: 39
slave thread set trigger and exit
thread 1 complete
thread 0 complete
real 0m5.416s
user 0m2.990s
sys 0m1.623s
$
这需要仔细思考才能理解。简而言之,该应用程序会从一个线程不断发出内核,每个内核只是执行约0.7秒的延迟然后返回;另一个线程则等待少量内核被发出之后,再发出cudaStreamSynchronize()调用。应用程序的总体运行时间反映了该调用何时返回。只要把命令行参数(即两次内核启动之间的主机端延迟)保持在小于约0.5秒的值,应用程序就会稳定地在约5.4秒内退出(具体数值取决于您所用的GPU,但在主机延迟参数增大到相当大的值之前,应用程序的总执行时间应保持不变)。
如果指定的命令行参数大于您机器上的内核持续时间,则整个应用程序的执行时间将约为命令行参数(以微秒为单位)的5倍,因为cudaStreamSynchronize()调用的触发点是5。
我是在CUDA 8.0.61、Ubuntu 14.04、Pascal Titan X上编译并运行的。