I call cudaEventQuery() from a periodic ITIMER signal handler registered in the main program, while the host thread is waiting in cudaDeviceSynchronize() for a GPU kernel to finish.
I see that cudaEventQuery() does not return and blocks.
I attached gdb to the program while cudaEventQuery() was stuck; the source file and call stack are included below.
I would appreciate any information or help on eliminating this problem/bug.
CUDA 4.1 on an NVIDIA Tesla 2070 GPU.
#include <stdio.h>
#include <cuda.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define CHECK_CU_ERROR(err, cufunc)                                 \
    if (err != CUDA_SUCCESS)                                        \
    {                                                               \
        printf("Error %d for CUDA Driver API function '%s'.\n",     \
               err, cufunc);                                        \
        exit(-1);                                                   \
    }

#define N 100000

static CUcontext context;
static CUdevice device;
cudaEvent_t event;

void event_handler(int signum)
{
    printf("\n Timer triggered!");
    if (cudaEventQuery(event) == cudaSuccess) {
        printf("\n Event finished");
        fflush(stdout);
    } else {
        printf("\n Event NOT finished");
        fflush(stdout);
    }
}

// Device code
__global__ void VecAdd(const int *A, const int *B, int *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    for (long long m = 0; m < 10; m++)
        for (long long n = 0; n < 100000; n++)
            if (i < size)
                C[i] = A[i] + B[i];
}

static void initVec(int *vec, int n)
{
    for (int i = 0; i < n; i++)
        vec[i] = i;
}

int *d_A;
int *d_B;
int *d_C;

static void *compute(void *ip)
{
    size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int *h_A, *h_B, *h_C;
    //int id = (int) pthread_self() + 1;

    // Allocate input vectors h_A and h_B in host memory
    h_A = (int *) malloc(size);
    h_B = (int *) malloc(size);
    h_C = (int *) malloc(size);

    // Initialize input vectors
    initVec(h_A, N);
    initVec(h_B, N);
    memset(h_C, 0, size);

    // Allocate vectors in device memory
    cudaMalloc((void **) &d_A, size);
    cudaMalloc((void **) &d_B, size);
    cudaMalloc((void **) &d_C, size);

    // Copy vectors from host memory to device memory
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice);

    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0>>>(d_A, d_B, d_C, N);

    cudaEventCreate(&event);
    cudaEventRecord(event);
    printf("\n Record");
    fflush(stdout);

    struct sigaction sa;
    struct itimerval timer;
    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = &event_handler;
    sigaction(SIGALRM, &sa, NULL);
    timer.it_value.tv_sec = 0;
    timer.it_value.tv_usec = 250;
    timer.it_interval.tv_sec = 1;
    timer.it_interval.tv_usec = 250;
    setitimer(ITIMER_REAL, &timer, NULL);
    return 0;
}

int main(int argc, char *argv[])
{
    CUresult err;
    int deviceNum = 0;
    int deviceCount = 0;

#if 0
    // Try different flags
    if (cudaSetDeviceFlags(cudaDeviceScheduleSpin) != cudaSuccess) {
        printf("\n failed cudaSetDeviceFlags");
        exit(-1);
    }
#endif

    err = cuInit(0);
    CHECK_CU_ERROR(err, "cuInit");
    err = cuDeviceGetCount(&deviceCount);
    CHECK_CU_ERROR(err, "cuDeviceGetCount");
    if (deviceCount == 0) {
        printf("There is no device supporting CUDA.\n");
        exit(-1);
    }
    err = cuDeviceGet(&device, deviceNum);
    CHECK_CU_ERROR(err, "cuDeviceGet");
    err = cuCtxCreate(&context, 0, device);
    CHECK_CU_ERROR(err, "cuCtxCreate");

    compute(0);
    cudaDeviceSynchronize();
    printf("\n SYNCed");
    while (1);
    return 0;
}
(gdb) bt
#0 0x00000037f520e034 in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00000037f5209345 in _L_lock_868 () from /lib64/libpthread.so.0
#2 0x00000037f5209217 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00007f7bb6fd75b7 in ?? () from /usr/lib64/libcuda.so.1
#4 0x00007f7bb6fd575a in ?? () from /usr/lib64/libcuda.so.1
#5 0x00007f7bb70062e3 in ?? () from /usr/lib64/libcuda.so.1
#6 0x00007f7bb700c3ec in ?? () from /usr/lib64/libcuda.so.1
#7 0x00007f7bb6fc95d8 in ?? () from /usr/lib64/libcuda.so.1
#8 0x00007f7bb6fb9c35 in ?? () from /usr/lib64/libcuda.so.1
#9 0x00007f7bb6a5ad57 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
#10 0x00007f7bb6a8c4f2 in cudaEventQuery () from /usr/local/cuda/lib64/libcudart.so.4
#11 0x0000000000400e8d in event_handler (signum=14) at event_sampling.cu:40
#12 <signal handler called>
#13 0x00007f7bb7003791 in ?? () from /usr/lib64/libcuda.so.1
#14 0x00007f7bb6fd5786 in ?? () from /usr/lib64/libcuda.so.1
#15 0x00007f7bb70062e3 in ?? () from /usr/lib64/libcuda.so.1
#16 0x00007f7bb7006646 in ?? () from /usr/lib64/libcuda.so.1
#17 0x00007f7bb6fd5839 in ?? () from /usr/lib64/libcuda.so.1
#18 0x00007f7bb6fc86e0 in ?? () from /usr/lib64/libcuda.so.1
#19 0x00007f7bb6fa7d62 in ?? () from /usr/lib64/libcuda.so.1
#20 0x00007f7bb6a5e9d3 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
#21 0x00007f7bb6a9318c in cudaDeviceSynchronize () from /usr/local/cuda/lib64/libcudart.so.4
#22 0x00000000004012b3 in main (argc=1, argv=0x7fff20bff048) at event_sampling.cu:157
(gdb)
/*
 * Copyright 2011 NVIDIA Corporation. All rights reserved
 *
 * Sample app to demonstrate use of CUPTI library to obtain profiler
 * event values by sampling.
 */
#include <stdio.h>
#include <cuda.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define CHECK_CU_ERROR(err, cufunc)                                 \
    if (err != CUDA_SUCCESS)                                        \
    {                                                               \
        printf("Error %d for CUDA Driver API function '%s'.\n",     \
               err, cufunc);                                        \
        exit(-1);                                                   \
    }

#define N 100000

cudaEvent_t event;

void event_handler(int signum)
{
    printf("\n Timer triggered!");
    if (cudaEventQuery(event) == cudaSuccess) {
        printf("\n Event finished");
        fflush(stdout);
    } else {
        printf("\n Event NOT finished");
        fflush(stdout);
    }
}

// Device code
__global__ void VecAdd(const int *A, const int *B, int *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    for (long long m = 0; m < 10; m++)
        for (long long n = 0; n < 100000; n++)
            if (i < size)
                C[i] = A[i] + B[i];
}

static void initVec(int *vec, int n)
{
    for (int i = 0; i < n; i++)
        vec[i] = i;
}

int *d_A;
int *d_B;
int *d_C;

static void *compute(void *ip)
{
    size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int *h_A, *h_B, *h_C;
    //int id = (int) pthread_self() + 1;

    // Allocate input vectors h_A and h_B in host memory
    h_A = (int *) malloc(size);
    h_B = (int *) malloc(size);
    h_C = (int *) malloc(size);

    // Initialize input vectors
    initVec(h_A, N);
    initVec(h_B, N);
    memset(h_C, 0, size);

    // Allocate vectors in device memory
    cudaMalloc((void **) &d_A, size);
    cudaMalloc((void **) &d_B, size);
    cudaMalloc((void **) &d_C, size);

    // Copy vectors from host memory to device memory
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice);

    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0>>>(d_A, d_B, d_C, N);

    cudaEventCreate(&event);
    cudaEventRecord(event);
    printf("\n Record");
    fflush(stdout);

    struct sigaction sa;
    struct itimerval timer;
    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = &event_handler;
    sigaction(SIGALRM, &sa, NULL);
    timer.it_value.tv_sec = 0;
    timer.it_value.tv_usec = 250;
    timer.it_interval.tv_sec = 1;
    timer.it_interval.tv_usec = 250;
    setitimer(ITIMER_REAL, &timer, NULL);
    return 0;
}

int main(int argc, char *argv[])
{
    CUresult err;

#if 0
    // Try different flags
    if (cudaSetDeviceFlags(cudaDeviceScheduleSpin) != cudaSuccess) {
        printf("\n failed cudaSetDeviceFlags");
        exit(-1);
    }
#endif

    compute(0);
    cudaDeviceSynchronize();
    printf("\n SYNCed");
    fflush(stdout);
    while (1)
        sleep(10);
    return 0;
}
Answer 0 (score: 1)
You have a thread that launches a kernel, records an event after the kernel launch, and then calls cudaDeviceSynchronize(). When your signal handler runs, it tries to make another CUDA API call, and that call blocks.
You are ignoring a basic principle of signal handlers: do as little as possible inside them!
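For illustration, here is a minimal sketch (not from the original post) of that principle: the handler only sets a flag, and every CUDA call stays in the host thread. It assumes a cudaEvent_t named event that has already been recorded, as in your code; the helper names are hypothetical.

#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <cuda_runtime.h>

extern cudaEvent_t event;                  // assumed: created and recorded elsewhere
static volatile sig_atomic_t timer_fired = 0;

static void timer_handler(int signum)
{
    timer_fired = 1;                       // async-signal-safe: no CUDA, no stdio here
}

// Called from the host thread instead of blocking in cudaDeviceSynchronize().
static void poll_event(void)
{
    while (cudaEventQuery(event) == cudaErrorNotReady) {
        if (timer_fired) {
            timer_fired = 0;
            printf("Timer triggered, kernel still running\n");
            fflush(stdout);
        }
        usleep(1000);                      // yield the CPU between polls
    }
    printf("Event finished\n");
    fflush(stdout);
}

With this pattern the driver's internal locks are only ever taken from one thread, so the handler can never deadlock against an in-flight CUDA call.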
The real question is what exactly you are trying to achieve. You could simply wait on the event (cudaEventSynchronize()) to accomplish what you are doing here, but if your goal is more complex you should think in more detail about how to implement it; a signal handler is not the right approach.