I have two streams, A and B, in different contexts (different devices, different host threads, created automatically). I need to synchronize the execution of a kernel K in stream A with two different events in stream B, so that K launches only after event 1 has fired but not after event 2. Is this possible in principle? (Using cudaStreamWaitEvent?)
My stream B runs in a loop, so I get a sequence of events like 1, 2, 1, 2, 1, 2, and so on. But my kernel K, in a separate stream, should only start between a 1 and the following 2, never after a 2 and before the next 1.
Example:
Host thread X:
Loop:
kernel1(userStreamA)
...
kernelK(userStreamA) <-- this should start only between E1 and E2
...
kernelN(userStreamA)
Host thread Y:
Loop:
kernel1(userStreamB)
record(E1)
kernel2(userStreamB)
some other kernels in userStreamB
record(E2)
kernel3(userStreamB)
...
kernelN(userStreamB)
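To make this concrete, here is a minimal single-thread, single-device sketch of the part I already understand (kernel1 and kernelK are just placeholder kernels, one iteration, no E2 yet). What I cannot see is how to issue the cudaStreamWaitEvent from a different host thread, in a different context, and how to also rule out a start after E2:

#include <cuda_runtime.h>

__global__ void kernel1() {}
__global__ void kernelK() {}

int main()
{
  cudaStream_t userStreamA, userStreamB;
  cudaEvent_t E1;
  cudaStreamCreate(&userStreamA);
  cudaStreamCreate(&userStreamB);
  cudaEventCreate(&E1);

  kernel1<<<1, 1, 0, userStreamB>>>();
  cudaEventRecord(E1, userStreamB);        // E1 fires once kernel1 has finished

  cudaStreamWaitEvent(userStreamA, E1, 0); // K may not start before E1...
  kernelK<<<1, 1, 0, userStreamA>>>();     // ...but nothing keeps it from starting after E2

  cudaDeviceSynchronize();
  return 0;
}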
Answer 0 (score: 1)
As far as I know, CUDA calls alone may not be the best choice for synchronizing host threads/processes in the way you need (i.e., with such fine-grained control over a specific host-side situation). If your problem is restricted to using only CUDA calls, you can disregard the alternative solution given below.
You could easily achieve what you need with OpenMP or MPI, which give you high-level tools to control the CPU-side synchronization and execution flow of your threads/processes without getting in the way of your GPU kernels.
Hint: consider OpenMP synchronization barriers, or MPI wait/send/receive messages; a minimal sketch follows.
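For illustration only (this sketch is not part of the original answer), here is the OpenMP idea on a single device, with placeholder kernels kernel1 and kernelK: the barrier guarantees that the cudaStreamWaitEvent in stream A is not issued before E1 has been recorded in stream B.

#include <omp.h>
#include <cuda_runtime.h>

__global__ void kernel1() {}
__global__ void kernelK() {}

int main()
{
  cudaStream_t streamA, streamB;
  cudaEvent_t E1;
  cudaStreamCreate(&streamA);
  cudaStreamCreate(&streamB);
  cudaEventCreate(&E1);

  #pragma omp parallel num_threads(2)
  {
    int tid = omp_get_thread_num();
    if (tid == 0) {                        // plays the role of host thread Y
      kernel1<<<1, 1, 0, streamB>>>();
      cudaEventRecord(E1, streamB);        // E1 is now recorded in stream B
    }
    #pragma omp barrier                    // no thread passes before E1 is recorded
    if (tid == 1) {                        // plays the role of host thread X
      cudaStreamWaitEvent(streamA, E1, 0); // K cannot start before E1 completes
      kernelK<<<1, 1, 0, streamA>>>();
    }
  }
  cudaDeviceSynchronize();
  return 0;
}

Compile with something like nvcc -Xcompiler -fopenmp; the same pattern carries over to MPI by replacing the barrier with a matching send/receive pair.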
Answer 1 (score: 1)
The basic problem I see (and probably the one you ran into in your previous question) is that cudaStreamWaitEvent has no effect if the event has not been "recorded" yet.
So, to work around that, I suggest adding a shared flag or semaphore between the two threads, which guarantees that the cudaStreamWaitEvent in stream A is not issued before the corresponding cudaEventRecord in stream B has been issued, and vice versa (we can use the same flag for both directions).
The following code implements this with pthreads:
$ cat t884.cu
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>
#define N_LOOPS 3
#define MAX_EVENT 4
#define MAX_STREAM 4
#define PTHREADS 2
#define cudaCheckErrors(msg) \
    do { \
      cudaError_t __err = cudaGetLastError(); \
      if (__err != cudaSuccess) { \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
            msg, cudaGetErrorString(__err), \
            __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
      } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// returns elapsed microseconds since `start` (not used in this example)
long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
#define DELAY_T 1000000000ULL
template <int type>
__global__ void delay_kern(int i){
  unsigned long long time = clock64();
#ifdef DEBUG
  printf("hello %d\n", type);
#endif
  // busy-wait for roughly i*DELAY_T GPU clock cycles (~1s per unit here)
  while (clock64() < time+(i*DELAY_T));
}
static volatile int flag;  // host-side handshake, busy-waited on by both threads
// The thread configuration structure.
typedef struct
{
  int device;
  int my_thread_ordinal;
  pthread_t thread;
  cudaError_t status;
  cudaStream_t *streams[MAX_STREAM];
  cudaEvent_t *events[MAX_EVENT];
}
config_t;
// The function executed by each thread, one per CUDA device.
void *thread_func(void *arg)
{
  // Unpack the config structure.
  config_t *config = (config_t *)arg;
  int device = config->device;
  int my_thread = config->my_thread_ordinal;
  cudaError_t cuda_status = cudaSuccess;
  cuda_status = cudaSetDevice(device);
  if (cuda_status != cudaSuccess) {
    fprintf(stderr, "Cannot set focus to device %d, status = %d\n",
            device, cuda_status);
    config->status = cuda_status;
    pthread_exit(NULL);
  }
  printf("thread %d initialized\n", my_thread);
  switch(config->my_thread_ordinal){
    case 0:
      // master thread - thread Y, driving stream B on device 0
      for (int i = 0; i < N_LOOPS; i++){
        delay_kern<0><<<1,1,0,*(config->streams[0])>>>(1);
        cudaEventRecord(*(config->events[0]), *(config->streams[0]));        // record E1
        flag = 1;                                                            // tell thread X that E1 is recorded
        delay_kern<1><<<1,1,0,*(config->streams[0])>>>(1);
        while (flag == 1){};                                                 // wait until thread X has recorded the third event
        cudaStreamWaitEvent(*(config->streams[0]), *(config->events[2]),0);  // E2 must not be recorded before kernel K is done
        cudaEventRecord(*(config->events[1]), *(config->streams[0]));        // record E2
        delay_kern<2><<<1,1,0,*(config->streams[0])>>>(1);
      }
      break;
    default:
      // slave thread - thread X, driving stream A on device 1
      for (int i = 0; i < N_LOOPS; i++){
        delay_kern<3><<<1,1,0,*(config->streams[1])>>>(1);
        while (flag == 0){};                                                 // wait until thread Y has recorded E1
        cudaStreamWaitEvent(*(config->streams[1]), *(config->events[0]),0);  // kernel K must not start before E1
        delay_kern<4><<<1,1,0,*(config->streams[1])>>>(1);                   // this is "kernel K"
        cudaEventRecord(*(config->events[2]), *(config->streams[1]));        // record the third event after K
        flag = 0;                                                            // tell thread Y that the third event is recorded
        delay_kern<5><<<1,1,0,*(config->streams[1])>>>(1);
      }
      break;
  }
  cudaDeviceSynchronize();
  cudaCheckErrors("thread CUDA error");
  printf("thread %d complete\n", my_thread);
  config->status = cudaSuccess;
  return NULL;
}
int main(int argc, char* argv[])
{
  flag = 0;
  const int nthreads = PTHREADS;
  // Create worker configs. Their data will be passed as
  // arguments to thread_func.
  config_t* configs = (config_t*)malloc(sizeof(config_t) * nthreads);
  cudaStream_t s[MAX_STREAM];
  cudaEvent_t e[MAX_EVENT];
  cudaSetDevice(0);
  cudaStreamCreate(s+0);   // stream used by thread Y ("stream B"), device 0
  cudaEventCreate(e+0);    // E1
  cudaEventCreate(e+1);    // E2
  cudaSetDevice(1);
  cudaStreamCreate(s+1);   // stream used by thread X ("stream A"), device 1
  cudaEventCreate(e+2);    // the "third" event, recorded after kernel K
  // Create a separate thread per device
  // and execute thread_func in it.
  for (int i = 0; i < nthreads; i++) {
    config_t *config = configs + i;
    config->device = i;
    config->my_thread_ordinal = i;
    for (int j = 0; j < PTHREADS; j++) config->streams[j] = s+j;
    for (int j = 0; j < PTHREADS+1; j++) config->events[j] = e+j;
    int status = pthread_create(&config->thread, NULL, thread_func, config);
    if (status) {
      fprintf(stderr, "Cannot create thread for device %d, status = %d\n",
              i, status);
    }
  }
  // Wait for device threads completion.
  // Check error status.
  int status = 0;
  for (int i = 0; i < nthreads; i++) {
    pthread_join(configs[i].thread, NULL);
    status += configs[i].status;
  }
  if (status)
    return status;
  free(configs);
  return 0;
}
$ nvcc -o t884 t884.cu -lpthread
$ time ./t884
thread 1 initialized
thread 0 initialized
thread 1 complete
thread 0 complete
real 0m9.738s
user 0m12.102s
sys 0m6.235s
$
The kernels are templated so that we can tell things apart more easily in the profiler. The kernels themselves simply implement a delay of roughly 1 second each (this will be evident in the profiler output below). Since a total of 9 kernels are launched on each device, the fact that the total execution time is about 9 seconds already gives a rough indication of good overlap/concurrency. I have also implemented the "third" event working "the other way around", to make sure that the recording of event E2 in stream B does not happen before kernel K has finished. But to witness the specific synchronization you are looking for, between your kernel K in stream A and the event recorded in stream B, we need to look at the profiler output:
$ nvprof --print-gpu-trace ./t884
==14914== NVPROF is profiling process 14914, command: ./t884
thread 0 initialized
thread 1 initialized
thread 1 complete
thread 0 complete
==14914== Profiling application: ./t884
==14914== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
887.60ms 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=0>(int) [188]
887.64ms 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=3>(int) [192]
1.86225s 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=1>(int) [195]
1.86225s 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=4>(int) [199]
2.81905s 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=5>(int) [204]
2.83690s 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=2>(int) [208]
3.77584s 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=3>(int) [212]
3.81155s 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=0>(int) [214]
4.78619s 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=1>(int) [219]
4.78620s 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=4>(int) [222]
5.74300s 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=5>(int) [227]
5.76084s 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=2>(int) [231]
6.69979s 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=3>(int) [235]
6.73549s 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=0>(int) [237]
7.71014s 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=1>(int) [242]
7.71015s 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=4>(int) [245]
8.66694s 956.79ms (1 1 1) (1 1 1) 8 0B 0B - - GeForce GT 640 2 21 void delay_kern<int=5>(int) [250]
8.68479s 974.65ms (1 1 1) (1 1 1) 7 0B 0B - - Quadro 5000 (0) 1 13 void delay_kern<int=2>(int) [254]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$
What we see, in all 3 iterations of the loop, is that delay_kern<int=1> on one device and delay_kern<int=4> on the other device (the latter effectively being your kernel K) have nearly identical start times. This gives me a high degree of confidence that the event behavior is enforcing the desired ordering.
The thread synchronization imposed by this proposal has the drawback that we can no longer issue long sequences of asynchronous activity from the host threads (although we still get concurrent processing on the devices, with the stream synchronization you want). But I don't really see a way to enforce the behavior you are after without some element of host-thread synchronization.