'invalid device ordinal' (cudaErrorInvalidDevice) returned by cudaIpcOpenMemHandle

Time: 2015-08-19 23:15:45

Tags: cuda

When I try to open an IPC memory handle I get cudaErrorInvalidDevice, and I am not sure why. I am using four GTX 980s on CentOS 6.6.

I am sharing GPU memory between different processes. In one process I do:

handle_cuda_error(cudaSetDevice(device_id));
handle_cuda_error(cudaIpcGetMemHandle(&my_ipcmemhandle, (void *)buf_gpu));

In the other process I do:

handle_cuda_error(cudaSetDevice(device_id));
handle_cuda_error(cudaIpcOpenMemHandle((void **)buf_gpu, my_ipcmemhandle, cudaIpcMemLazyEnablePeerAccess));

The device ID set before getting the IPC handle is the same as the device ID set before opening it, and the device ID is valid. I use (host) shared memory to pass the IPC handle and the device ID between the two processes. Also, handle_cuda_error calls cudaGetLastError(), so I am not picking up an error left over from a previous call.
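
For reference, here is a minimal sketch of how the handle and device ID can travel through host shared memory; the System V calls, the ipc_info_t struct, the SHM_KEY value, and the helper names are illustrative assumptions, not my actual code:

#include <sys/ipc.h>
#include <sys/shm.h>
#include <cuda_runtime.h>

// layout of the host shared-memory segment (illustrative)
typedef struct {
    int device_id;               // device that owns the exported buffer
    cudaIpcMemHandle_t handle;   // handle exported for the device buffer
} ipc_info_t;

#define SHM_KEY 0x1234           /* arbitrary example key */

// exporting process: publish the device ID and IPC handle (error checks omitted)
void publish_handle(int device_id, void *buf_gpu) {
    int shmid = shmget(SHM_KEY, sizeof(ipc_info_t), IPC_CREAT | 0666);
    ipc_info_t *info = (ipc_info_t *)shmat(shmid, NULL, 0);
    info->device_id = device_id;
    cudaIpcGetMemHandle(&info->handle, buf_gpu);
    shmdt(info);
}

// importing process: attach, set the device, and open the handle
void *open_shared_buffer(void) {
    int shmid = shmget(SHM_KEY, sizeof(ipc_info_t), 0666);
    ipc_info_t *info = (ipc_info_t *)shmat(shmid, NULL, 0);
    void *buf_gpu = NULL;
    cudaSetDevice(info->device_id);
    cudaIpcOpenMemHandle(&buf_gpu, info->handle, cudaIpcMemLazyEnablePeerAccess);
    shmdt(info);
    return buf_gpu;
}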

When I treat the cudaIpcMemHandle_t instance as a char[64] array and print it to the screen, I get the same result in both processes.

Am I missing something? Any explanation of what this error means for this particular call (cudaIpcOpenMemHandle(...)) would be very helpful.

1 Answer:

Answer 0 (score: 1)

It turns out that the call that opens the IPC handle was still being made from the process that created it, just from a different thread. In that case the call to cudaIpcOpenMemHandle returns cudaErrorInvalidDevice. The following example confirms this:

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <cuda_runtime.h>

void * run_method(void *arg);
static void handle_cuda_error(const char *pfx, cudaError_t err);
static void print_cuda_ipc_mem_handle(cudaIpcMemHandle_t *ipc_mem_handle);

typedef struct shared_context {
    int device_id;
    cudaIpcMemHandle_t cuipcmemhandle;
    int *shared_gpu;
} shared_context_t;

int shmid;
int verbose = 0;
int use_ipc = 0;

int main(int argc, char **argv) {
    int *ptr_local_num;
    int *ptr_local_num_gpu;
    int local_device_id;
    int rv;
    char pfx[256];
    const int ARRAY_SIZE = 1024*1024*512;
    shared_context_t shctx;
    pthread_t pthread;

    // check verbose flag in arguments list
    switch (argc) {
        case 3: {
            if (strcmp("ipc",argv[2]) == 0) {
                use_ipc = 1;
                fprintf(stdout,"Using IPC\n");
            }
            // no break
        }
        case 2: {
            if (strcmp("v",argv[1]) == 0) {
                verbose = 1;
                fprintf(stdout,"Verbose output selected\n");
            }
            break;
        }
    }

    // set the device
    shctx.device_id = 0;
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaSetDevice(shctx.device_id));
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaGetDevice(&local_device_id));
    fprintf(stdout,"main: local_device_id = %d\n",local_device_id);
    // cuda-allocate host memory in parent process
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMallocHost(&ptr_local_num,sizeof(int)*ARRAY_SIZE));
    // set the value in host memory
    *ptr_local_num = 42;
    *(ptr_local_num + ARRAY_SIZE - 1) = 42;
    fprintf(stdout,"main: local_num = %d .. %d\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));
    // allocate device memory in parent process
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMalloc(&ptr_local_num_gpu,sizeof(int)*ARRAY_SIZE));
    // copy set value in host memory to device memory
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMemcpy(ptr_local_num_gpu,ptr_local_num,sizeof(int)*ARRAY_SIZE,cudaMemcpyHostToDevice));
    if (use_ipc) {
        // get cuda-ipc memory handle
        snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcGetMemHandle(&shctx.cuipcmemhandle,ptr_local_num_gpu));
        print_cuda_ipc_mem_handle(&shctx.cuipcmemhandle);
    } else {
        shctx.shared_gpu = ptr_local_num_gpu;
    }
    // launch pthread
    rv = pthread_create(&pthread, NULL, run_method, &shctx);
    if (rv != 0) {      
        char error_strings[4][200] = { "insufficient resources to create another thread, or a system-imposed limit on the number of threads was encountered",
                                    "invalid settings in attr",
                                    "no permission to set the scheduling policy and parameters specified in attr",
                                    "unknown" };
        char *errstr;
        switch(rv) {
        case EAGAIN:
            errstr = error_strings[0];
            break;
        case EINVAL:
            errstr = error_strings[1];
            break;
        case EPERM:
            errstr = error_strings[2];
            break;
        default:
            errstr = error_strings[3];
            break;
        }
        fprintf(stderr,"ERROR, pthread_create returned %d [%s]\n",rv,errstr);
        exit(EXIT_FAILURE);
    }
    // and wait for pthread to finish
    rv = pthread_join(pthread, NULL);
    if (rv != 0) {      
        char error_strings[4][200] = { "a deadlock was detected, or thread specifies the calling thread",
                                    "thread is not a joinable thread OR another thread is already waiting to join with this thread",
                                    "no thread with the ID thread could be found",
                                    "unknown" };
        char *errstr;
        switch(rv) {
        case EDEADLK:
            errstr = error_strings[0];
            break;
        case EINVAL:
            errstr = error_strings[1];
            break;
        case ESRCH:
            errstr = error_strings[2];
            break;
        default:
            errstr = error_strings[3];
            break;
        }
        fprintf(stderr,"ERROR, pthread_create returned %d [%s]\n",rv,errstr);
        exit(EXIT_FAILURE);
    }
    fprintf(stdout,"main: pthread is done\n");
    // ... and then it should be safe to free device memory
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaFree((void *)ptr_local_num_gpu));


    return 0;
}

void *run_method(void *arg) {

    int *ptr_local_num;
    int *ptr_local_num_gpu;
    int local_device_id;
    char pfx[256];
    const int ARRAY_SIZE = 1024*1024*512;

    shared_context_t *shctx = (shared_context_t *)arg;

    // cuda-allocate host memory in pthread process for async copy
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMallocHost(&ptr_local_num,sizeof(int)*ARRAY_SIZE));
    // set some known (incorrect) value
    *ptr_local_num = -1;
    *(ptr_local_num + ARRAY_SIZE - 1) = -1;
    fprintf(stdout,"pthread: local_num = %d .. %d (before copy)\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));
    // get the device id, and set
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaSetDevice(shctx->device_id));
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaGetDevice(&local_device_id));
    fprintf(stdout,"pthread: local_device_id = %d\n",local_device_id);
    if (use_ipc) {
        // open the cuda-ipc memory handle
        snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcOpenMemHandle((void **)&ptr_local_num_gpu,shctx->cuipcmemhandle, cudaIpcMemLazyEnablePeerAccess));
        print_cuda_ipc_mem_handle(&shctx->cuipcmemhandle);
    } else {
        // use shared pointer directly
        ptr_local_num_gpu = shctx->shared_gpu;
    }
    // now we can copy data back
    snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaMemcpy(ptr_local_num,ptr_local_num_gpu,sizeof(int)*ARRAY_SIZE,cudaMemcpyDeviceToHost));
    if (use_ipc) {
        // close cuda-ipc memory handle
        snprintf(pfx,256,"%s:%s(%4d):",__FILE__,__FUNCTION__,__LINE__); handle_cuda_error(pfx,cudaIpcCloseMemHandle((void *)ptr_local_num_gpu));
    } else {
        // just reset local pointer
        ptr_local_num_gpu = NULL;
    }
    // value should match the one in the parent process
    fprintf(stdout,"pthread: local_num = %d .. %d (after copy)\n",*ptr_local_num,*(ptr_local_num + ARRAY_SIZE - 1));

    return NULL;
}

static void handle_cuda_error(const char *pfx, cudaError_t err) {
    // make sure the last error was popped
    cudaGetLastError();
    if (err == cudaSuccess && !verbose) {
        return;
    }
    fprintf(stderr,"%scudaError is %d [%s]\n",pfx,(int)err,cudaGetErrorString(err));
}

static void print_cuda_ipc_mem_handle(cudaIpcMemHandle_t *ipc_mem_handle) {
    const int N_BYTES = 64;
    const int MSG_LEN = N_BYTES*2+1;
    char msg[MSG_LEN];
    int ii;
    for (ii=0; ii<N_BYTES; ii++) {
        // two hex characters per byte; each snprintf's terminating NUL is
        // overwritten by the next iteration, leaving a single NUL at the end
        snprintf(msg+2*ii,MSG_LEN-2*ii,"%02x",*((unsigned char *)ipc_mem_handle + ii));
    }
    fprintf(stdout,"%s:%s(%3d): ipc_mem_handle = [%s]\n",__FILE__,__FUNCTION__,__LINE__,msg);
}
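
A plausible way to build and run the example (the file name is taken from the error output below; the exact nvcc flags are an assumption): compile with nvcc -o multithread multithread.cu -lpthread and run ./multithread v ipc to enable verbose output and select the IPC path.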

When IPC is used, the program reports this error while trying to open the IPC handle:

multithread.cu:run_method( 156):cudaError is 10 [invalid device ordinal]
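
Note that the non-IPC path in the example (running without the ipc argument) shows what does work within a single process: the device pointer from cudaMalloc is simply handed to the other thread through the shared context struct and used there directly, since both threads share the same CUDA context. cudaIpcOpenMemHandle is only meant for importing memory exported by a different process.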