I am using CUDA 4.1 and CUPTI on a Tesla C2070.
The code has two threads. The first thread launches a long-running kernel and waits on cudaDeviceSynchronize(); the second thread then launches a small kernel.
I subscribe to CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020 and CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020.
This causes the second kernel's launch to be blocked until the first thread completes cudaDeviceSynchronize(): the second thread does not return from cudaConfigureCall() until the first thread's cudaDeviceSynchronize() has finished.
If I do not subscribe with CUPTI, this does not happen. It looks like a nasty performance bug in CUPTI.
The call stacks below show the state of each thread. The code is attached at the end of this post.
(gdb) info threads
4 Thread 0x7f731467c710 (LWP 29708) 0x00000037f4ada083 in select () from /lib64/libc.so.6
3 Thread 0x7f7312b50710 (LWP 29709) 0x00007f7314d7e3a6 in ?? () from /usr/lib64/libcuda.so.1
2 Thread 0x7f731214f710 (LWP 29710) 0x00000037f4ac88d7 in sched_yield () from /lib64/libc.so.6
* 1 Thread 0x7f731477e720 (LWP 29707) 0x00000037f520803d in pthread_join () from /lib64/libpthread.so.0
(gdb) thread 2
[Switching to thread 2 (Thread 0x7f731214f710 (LWP 29710))]#0 0x00000037f4ac88d7 in sched_yield () from /lib64/libc.so.6
(gdb) bt
#0 0x00000037f4ac88d7 in sched_yield () from /lib64/libc.so.6
#1 0x00007f73149fb73c in ?? () from /usr/local/cuda/extras/CUPTI/lib64/libcupti.so.4
#2 0x00007f7314dabac3 in ?? () from /usr/lib64/libcuda.so.1
#3 0x00007f7314db1020 in ?? () from /usr/lib64/libcuda.so.1
#4 0x00007f73147bbee8 in cudaConfigureCall () from /usr/local/cuda/lib64/libcudart.so.4
#5 0x000000000040110f in Thread2 () at event_sampling.cu:121
#6 0x00000037f52077e1 in start_thread () from /lib64/libpthread.so.0
#7 0x00000037f4ae152d in clone () from /lib64/libc.so.6
(gdb) thread 3
[Switching to thread 3 (Thread 0x7f7312b50710 (LWP 29709))]#0 0x00007f7314d7e3a6 in ?? () from /usr/lib64/libcuda.so.1
(gdb) bt
#0 0x00007f7314d7e3a6 in ?? () from /usr/lib64/libcuda.so.1
#1 0x00007f7314d36b5a in ?? () from /usr/lib64/libcuda.so.1
#2 0x00007f7314d08976 in ?? () from /usr/lib64/libcuda.so.1
#3 0x00007f7314d396a3 in ?? () from /usr/lib64/libcuda.so.1
#4 0x00007f7314d39a06 in ?? () from /usr/lib64/libcuda.so.1
#5 0x00007f7314d08a29 in ?? () from /usr/lib64/libcuda.so.1
#6 0x00007f7314cfb830 in ?? () from /usr/lib64/libcuda.so.1
#7 0x00007f7314cdafa4 in ?? () from /usr/lib64/libcuda.so.1
#8 0x00007f731478ea13 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
#9 0x00007f73147c3827 in cudaDeviceSynchronize () from /usr/local/cuda/lib64/libcudart.so.4
#10 0x0000000000400fe2 in Thread1 (ip=0x0) at event_sampling.cu:101
#11 0x00000037f52077e1 in start_thread () from /lib64/libpthread.so.0
#12 0x00000037f4ae152d in clone () from /lib64/libc.so.6
(gdb) thread 4
[Switching to thread 4 (Thread 0x7f731467c710 (LWP 29708))]#0 0x00000037f4ada083 in select () from /lib64/libc.so.6
(gdb) bt
#0 0x00000037f4ada083 in select () from /lib64/libc.so.6
#1 0x00007f731524147b in ?? () from /usr/lib64/libcuda.so.1
#2 0x00007f7314d45d9b in ?? () from /usr/lib64/libcuda.so.1
#3 0x00007f7315242819 in ?? () from /usr/lib64/libcuda.so.1
#4 0x00000037f52077e1 in start_thread () from /lib64/libpthread.so.0
#5 0x00000037f4ae152d in clone () from /lib64/libc.so.6
(gdb)
Code
/*
* Copyright 2011 NVIDIA Corporation. All rights reserved
*
* Sample app to demonstrate use of CUPTI library to obtain profiler
* event values by sampling.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cupti.h>
#include <unistd.h>
#include <pthread.h>
#define CHECK_CU_ERROR(err, cufunc) \
if (err != CUDA_SUCCESS) \
{ \
printf ("Error %d for CUDA Driver API function '%s'.\n", \
err, cufunc); \
exit(-1); \
}
#define N 100000
static CUcontext context;
static CUdevice device;
static char *eventName;
// Device code
__global__ void VecAdd(const int* A, const int* B, int* C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    for (long long m = 0; m < 100; m++)
        for (long long n = 0; n < 100000; n++)
            if (i < size)
                C[i] = A[i] + B[i];
}
static void
initVec(int *vec, int n)
{
    for (int i = 0; i < n; i++)
        vec[i] = i;
}
// Device code
__global__ void VecSub(const int* A, const int* B, int* C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    for (long long n = 0; n < 100000; n++)
        if (i < size)
            C[i] = A[i] - B[i];
}
int *d_A; int *d_B; int *d_C;
cudaStream_t stream[2];
pthread_t threads[2];
static void *
Thread1(void * ip)
{
    fprintf(stderr, "\n Thread1 started");
    size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int sum, i;
    int *h_A, *h_B, *h_C;

    // Allocate input vectors h_A and h_B in host memory
    h_A = (int*)malloc(size);
    h_B = (int*)malloc(size);
    h_C = (int*)malloc(size);

    // Initialize input vectors
    initVec(h_A, N);
    initVec(h_B, N);
    memset(h_C, 0, size);

    // Allocate vectors in device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy vectors from host memory to device memory
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream[0]);
    cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream[0]);

    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    fprintf(stderr, "\n Kernel Launch Thread1"); fflush(stderr);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, stream[0]>>>(d_A, d_B, d_C, N);
    fprintf(stderr, "\n Kernel Launched Thread1"); fflush(stderr);

    fprintf(stderr, "\n Start cudaDeviceSynchronize Thread1"); fflush(stderr);
    cudaDeviceSynchronize();
    fprintf(stderr, "\n End cudaDeviceSynchronize Thread1"); fflush(stderr);
    return 0;
}
static void * Thread2(void *)
{
    sleep(5);
    fprintf(stderr, "\n Thread2 started");
    size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int sum, i;
    int *h_A, *h_B, *h_C;

    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    fprintf(stderr, "\n Kernel Launch Thread2"); fflush(stderr);
    VecSub<<<blocksPerGrid, threadsPerBlock, 0, stream[1]>>>(d_A, d_B, d_C, N);
    fprintf(stderr, "\n Kernel Launched Thread2"); fflush(stderr);

    fprintf(stderr, "\n Start cudaDeviceSynchronize Thread2"); fflush(stderr);
    cudaDeviceSynchronize();
    fprintf(stderr, "\n End cudaDeviceSynchronize Thread2"); fflush(stderr);
    return 0;
}
void CUPTIAPI CallBack(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void *cbData)
{
    uint32_t streamId = 0;
    const CUpti_CallbackData *cbInfo = (const CUpti_CallbackData *) cbData;
    if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020 && cbInfo->callbackSite == CUPTI_API_ENTER) {
        fprintf(stderr, "\n Event created");
        cudaConfigureCall_v3020_params *params = (cudaConfigureCall_v3020_params *) cbInfo->functionParams;
        cuptiGetStreamId(cbInfo->context, (CUstream) params->stream, &streamId);
        printf("\n stream %d", streamId);
    }
}
int
main(int argc, char *argv[])
{
    CUresult err;
    cudaStreamCreate(&stream[0]);
    cudaStreamCreate(&stream[1]);
#if 1
    CUpti_SubscriberHandle subscriber;
    cuptiSubscribe(&subscriber, (CUpti_CallbackFunc) CallBack, 0);
    cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
    cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020);
#endif
    cudaDeviceSynchronize();
    pthread_create(&threads[0], 0, Thread1, 0);
    pthread_create(&threads[1], 0, Thread2, 0);
    pthread_join(threads[0], 0);
    pthread_join(threads[1], 0);
    fprintf(stderr, "\n --------------over -----------");
    return 0;
}
Answer 0 (score: 1)
This could be due to the use of cudaDeviceSynchronize() in both threads. cudaDeviceSynchronize() forces the entire device to finish all previously issued commands before any subsequent command proceeds. It is a heavy hammer; use it sparingly.
I would suggest using cudaStreamSynchronize() in this case. If you need one stream to wait on another, use a cudaEvent together with cudaStreamWaitEvent().
If you collect events with CUPTI_EVENT_COLLECTION_MODE_KERNEL, CUPTI will only disable concurrent kernels. Enabling profiling (whether through environment variables, the Visual Profiler, or this CUPTI mode) disables concurrent kernel execution.
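For reference, the event collection mode is set per context with cuptiSetEventCollectionMode(). A minimal sketch, assuming the current driver context is fetched with cuCtxGetCurrent() and error checking is omitted:

CUcontext ctx = NULL;
cuCtxGetCurrent(&ctx);   // the context the runtime API is using on this thread
cuptiSetEventCollectionMode(ctx, CUPTI_EVENT_COLLECTION_MODE_KERNEL);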
Answer 1 (score: 0)
CUPTI has two general modes: activity collection and event collection.
With event collection, all kernel launches are serialized across the entire application. This is done because limitations of the hardware performance counters require that, to get accurate measurements for a kernel, only that single kernel be executing on the device.
With activity collection, CUPTI tries to perturb the application's behavior as little as possible. The goal is to observe the GPU's behavior as accurately as possible.
You are correct that there is a bug/limitation in CUPTI that causes cudaDeviceSync() (and other synchronization functions) to block CUDA calls on other threads. This is a known issue with activity collection (since it obviously defeats the primary goal of low-impact observation) and should be addressed in a future release.
Also, not your issue but mentioned in one of the answers, is the matter of concurrent kernel execution (i.e. running two or more kernels on the device at the same time). CUPTI disables concurrent kernel execution in all modes. This is also a known issue and will be addressed in an upcoming release.
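(For what it's worth, you can check whether the device itself reports support for concurrent kernel execution, independently of CUPTI disabling it at runtime, by querying the device properties. A small sketch, assuming device 0 is the C2070:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);   // device 0
printf("concurrentKernels = %d\n", prop.concurrentKernels);)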