cudaEventQuery() blocks

Asked: 2012-02-12 18:42:04

Tags: cuda

I call cudaEventQuery() from a periodic ITIMER callback registered in my main program, while the main thread waits in cudaDeviceSynchronize() for the GPU kernel to finish.

I am seeing cudaEventQuery() block and never return.

My program and the call stack, captured while cudaEventQuery() was stuck, are attached below.

I would appreciate any information/help on eliminating this problem/bug.


Configuration

CUDA 4.1 on an NVIDIA Tesla 2070 GPU.


My program

#include <stdio.h>
#include <cuda.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>


#define CHECK_CU_ERROR(err, cufunc)                                     \
    if (err != CUDA_SUCCESS)                                              \
{                                                                   \
    printf ("Error %d for CUDA Driver API function '%s'.\n",          \
            err, cufunc);                                             \
    exit(-1);                                                         \
}


#define N 100000


static CUcontext context;
static CUdevice device;
cudaEvent_t event;

void event_handler(int signum)
{
    printf("\n Timer triggered!");
    if (cudaEventQuery(event) == cudaSuccess) {
    printf("\n Event finished");
    fflush(stdout);
    } else {
    printf("\n Event NOT finished");
    fflush(stdout);
    }
}

// Device code
__global__ void VecAdd(const int *A, const int *B, int *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    for (long long m = 0; m < 10; m++)
    for (long long n = 0; n < 100000; n++)
        if (i < size)
        C[i] = A[i] + B[i];
}

static void initVec(int *vec, int n)
{
    for (int i = 0; i < n; i++)
    vec[i] = i;
}



int *d_A;
int *d_B;
int *d_C;


static void *compute(void *ip)
{
    size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int *h_A, *h_B, *h_C;
    //int id = (int) pthread_self() + 1;


    // Allocate input vectors h_A and h_B in host memory
    h_A = (int *) malloc(size);
    h_B = (int *) malloc(size);
    h_C = (int *) malloc(size);

    // Initialize input vectors
    initVec(h_A, N);
    initVec(h_B, N);
    memset(h_C, 0, size);

    // Allocate vectors in device memory
    cudaMalloc((void **) &d_A, size);
    cudaMalloc((void **) &d_B, size);
    cudaMalloc((void **) &d_C, size);

    // Copy vectors from host memory to device memory
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice);

    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock,0>>>(d_A, d_B, d_C, N);

    cudaEventCreate(&event);
    cudaEventRecord(event);
    printf("\n Record");
    fflush(stdout);

    struct sigaction sa;
    struct itimerval timer;
    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = &event_handler;
    sigaction(SIGALRM, &sa, NULL);
    timer.it_value.tv_sec = 0;
    timer.it_value.tv_usec = 250;
    timer.it_interval.tv_sec = 1;
    timer.it_interval.tv_usec = 250;
    setitimer(ITIMER_REAL, &timer, NULL);
    return 0;
}

int main(int argc, char *argv[])
{
    CUresult err;

    int deviceNum = 0;
    int deviceCount = 0;
#if 0
    // Try different flags
    if (cudaSetDeviceFlags(cudaDeviceScheduleSpin) != cudaSuccess) {
    printf("\n failed cudaSetDeviceFlags");
    exit(-1);
    }
#endif

    err = cuInit(0);
    CHECK_CU_ERROR(err, "cuInit");

    err = cuDeviceGetCount(&deviceCount);
    CHECK_CU_ERROR(err, "cuDeviceGetCount");

    if (deviceCount == 0) {
    printf("There is no device supporting CUDA.\n");
    exit(-1);
    }


    err = cuDeviceGet(&device, deviceNum);
    CHECK_CU_ERROR(err, "cuDeviceGet");


    err = cuCtxCreate(&context, 0, device);
    CHECK_CU_ERROR(err, "cuCtxCreate");


    compute(0);
    cudaDeviceSynchronize();
    printf("\n SYNCed");
    while (1);
    return 0;
}

Call stack with cudaEventQuery() blocked

(gdb) bt
#0 0x00000037f520e034 in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00000037f5209345 in _L_lock_868 () from /lib64/libpthread.so.0
#2 0x00000037f5209217 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00007f7bb6fd75b7 in ?? () from /usr/lib64/libcuda.so.1
#4 0x00007f7bb6fd575a in ?? () from /usr/lib64/libcuda.so.1
#5 0x00007f7bb70062e3 in ?? () from /usr/lib64/libcuda.so.1
#6 0x00007f7bb700c3ec in ?? () from /usr/lib64/libcuda.so.1
#7 0x00007f7bb6fc95d8 in ?? () from /usr/lib64/libcuda.so.1
#8 0x00007f7bb6fb9c35 in ?? () from /usr/lib64/libcuda.so.1
#9 0x00007f7bb6a5ad57 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
#10 0x00007f7bb6a8c4f2 in cudaEventQuery () from /usr/local/cuda/lib64/libcudart.so.4
#11 0x0000000000400e8d in event_handler (signum=14) at event_sampling.cu:40
#12 <signal handler called>
#13 0x00007f7bb7003791 in ?? () from /usr/lib64/libcuda.so.1
#14 0x00007f7bb6fd5786 in ?? () from /usr/lib64/libcuda.so.1
#15 0x00007f7bb70062e3 in ?? () from /usr/lib64/libcuda.so.1
#16 0x00007f7bb7006646 in ?? () from /usr/lib64/libcuda.so.1
#17 0x00007f7bb6fd5839 in ?? () from /usr/lib64/libcuda.so.1
#18 0x00007f7bb6fc86e0 in ?? () from /usr/lib64/libcuda.so.1
#19 0x00007f7bb6fa7d62 in ?? () from /usr/lib64/libcuda.so.1
#20 0x00007f7bb6a5e9d3 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
#21 0x00007f7bb6a9318c in cudaDeviceSynchronize () from /usr/local/cuda/lib64/libcudart.so.4
#22 0x00000000004012b3 in main (argc=1, argv=0x7fff20bff048) at event_sampling.cu:157
(gdb) 

Same code with the Driver API removed

    /*
 * Copyright 2011 NVIDIA Corporation. All rights reserved
 *
 * Sample app to demonstrate use of CUPTI library to obtain profiler
 * event values by sampling.
 */



#include <stdio.h>
#include <cuda.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>


#define CHECK_CU_ERROR(err, cufunc)                                     \
    if (err != CUDA_SUCCESS)                                              \
{                                                                   \
    printf ("Error %d for CUDA Driver API function '%s'.\n",          \
            err, cufunc);                                             \
    exit(-1);                                                         \
}


#define N 100000


cudaEvent_t event;
void event_handler(int signum)
{
    printf("\n Timer triggered!");

    if (cudaEventQuery(event) == cudaSuccess) {
    printf("\n Event finished");
    fflush(stdout);
    } else {
    printf("\n Event NOT finished");
    fflush(stdout);
    }
}

// Device code
__global__ void VecAdd(const int *A, const int *B, int *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    for (long long m = 0; m < 10; m++)
    for (long long n = 0; n < 100000; n++)
        if (i < size)
        C[i] = A[i] + B[i];
}

static void initVec(int *vec, int n)
{
    for (int i = 0; i < n; i++)
    vec[i] = i;
}



int *d_A;
int *d_B;
int *d_C;


static void *compute(void *ip)
{
    size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int *h_A, *h_B, *h_C;
    //int id = (int) pthread_self() + 1;


    // Allocate input vectors h_A and h_B in host memory
    h_A = (int *) malloc(size);
    h_B = (int *) malloc(size);
    h_C = (int *) malloc(size);

    // Initialize input vectors
    initVec(h_A, N);
    initVec(h_B, N);
    memset(h_C, 0, size);

    // Allocate vectors in device memory
    cudaMalloc((void **) &d_A, size);
    cudaMalloc((void **) &d_B, size);
    cudaMalloc((void **) &d_C, size);

    // Copy vectors from host memory to device memory
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice);

    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;


    VecAdd<<<blocksPerGrid, threadsPerBlock,0>>>(d_A, d_B, d_C, N);

    cudaEventCreate(&event);
    cudaEventRecord(event);

    printf("\n Record");
    fflush(stdout);

    struct sigaction sa;
    struct itimerval timer;
    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = &event_handler;
    sigaction(SIGALRM, &sa, NULL);
    timer.it_value.tv_sec = 0;
    timer.it_value.tv_usec = 250;
    timer.it_interval.tv_sec = 1;
    timer.it_interval.tv_usec = 250;
    setitimer(ITIMER_REAL, &timer, NULL);
    return 0;
}

int main(int argc, char *argv[])
{
    CUresult err;

#if 0
    // Try different flags
    if (cudaSetDeviceFlags(cudaDeviceScheduleSpin) != cudaSuccess) {
    printf("\n failed cudaSetDeviceFlags");
    exit(-1);
    }
#endif
    compute(0);
    cudaDeviceSynchronize();
    printf("\n SYNCed");
    fflush(stdout);
    while (1)sleep(10);
    return 0;
}

1 answer:

Answer 0 (score: 1)

You have a thread that launches a kernel, records an event to be queried after the kernel, and then calls cudaDeviceSynchronize(). When your signal handler fires, it attempts another CUDA API call; as the backtrace shows, that call tries to take an internal driver lock that the interrupted cudaDeviceSynchronize() on the same thread already holds, so the handler blocks.

You are ignoring one of the basic principles of signal handlers: do as little as possible.
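
For illustration, a minimal sketch of that principle applied to the code above (the flag name, the poll_event() helper, and the polling interval are mine, not part of the original program): the SIGALRM handler only sets an async-signal-safe flag, and cudaEventQuery() is called from ordinary thread context.

#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <cuda_runtime.h>

extern cudaEvent_t event;          /* recorded after the kernel, as above */

static volatile sig_atomic_t timer_fired = 0;

void event_handler(int signum)
{
    (void) signum;
    timer_fired = 1;               /* async-signal-safe; no CUDA calls, no printf */
}

/* Run from the main thread instead of blocking in cudaDeviceSynchronize() */
void poll_event(void)
{
    for (;;) {
        if (timer_fired) {
            timer_fired = 0;
            if (cudaEventQuery(event) == cudaSuccess) {
                printf("\n Event finished");
                fflush(stdout);
                break;
            }
            printf("\n Event NOT finished");
            fflush(stdout);
        }
        usleep(1000);              /* avoid spinning between timer ticks */
    }
}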

The real question is: what exactly are you trying to achieve? You could simply wait on the event (cudaEventSynchronize()) to accomplish what you are doing here, but if your goal is more complex you should think in more detail about how to achieve it; a signal handler is not the right approach.
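
If the goal is only to block until the kernel has finished, a sketch of the event-wait approach (the error handling is mine; it replaces the timer and signal handler entirely):

/* After VecAdd<<<blocksPerGrid, threadsPerBlock>>>(...) and cudaEventRecord(event): */
cudaError_t status = cudaEventSynchronize(event);   /* blocks until the event completes */
if (status != cudaSuccess) {
    printf("cudaEventSynchronize failed: %s\n", cudaGetErrorString(status));
    exit(-1);
}
printf("\n Event finished");
fflush(stdout);

If periodic sampling really is required (as in the CUPTI sample this code is based on), the query belongs in a dedicated worker thread that sleeps between polls, not in a SIGALRM handler.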