为什么同一块显卡在不同机器上运行相同的 OpenCL 代码,基准测试结果会有差异?

时间:2014-04-03 07:23:09

标签: c opencl gpu

我使用的显卡是 Nvidia GeForce GT 630,并在上面运行一个 OpenCL 程序。该程序收集若干 double 数组,并将它们作为缓冲区参数传递给 GPU 内核,内核使用这些数组进行计算。整个执行过程大约需要 10 分钟。这台机器使用的是英特尔(R)Core(TM)i5-3470 CPU。

类似的统计数据,

机器:英特尔(R)Core(TM)i3 CPU

显卡: GeForce 9500 GT

同一程序所花费的时间:16 分钟

我原以为耗时的差异是由显卡不同造成的。因此,我把英特尔(R)Core(TM)i3 机器中的 GeForce 9500 GT 换成了 Intel(R)Core(TM)i5-3470 机器上的那块显卡(即 GeForce GT 630)。

但耗时仍然是 16 分钟。程序运行期间,CPU 和 GPU 之间并没有数据交互。既然所有计算都只在同一块 GPU 上进行,谁能告诉我,为什么把高端显卡装到低端机器上之后,耗时没有任何改善?

提前感谢。

创建内核的代码:

/*
 * Load the OpenCL program source from "cw_calc.cl", select a platform and
 * device, create the context and command queue, build the program and
 * create the "gpu_solve" kernel.
 *
 * The results are stored in variables declared outside this function
 * (source_size, platform, platform_id, device_id, context, command_queue,
 * program, kernel, ret, ...). Every failure prints a message and ends the
 * process with exit(1); on success the function simply returns.
 */
void create_kernel () {
    FILE *fp;
    char *source_str;

    /* Read the kernel source text (at most MAX_SOURCE_SIZE bytes). */
    fp = fopen("cw_calc.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    if (!source_str) {
        fprintf(stderr, "Failed to allocate memory for kernel source.\n");
        fclose(fp);
        exit(1);
    }
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    /* Ask how many platforms exist, then fetch all of them into the
     * platform array. count starts at 0 so that a failed query cannot
     * leave it holding an indeterminate value. */
    cl_uint count = 0;
    clGetPlatformIDs(10, NULL, &count);
    platform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * count);
    if (platform != NULL && count > 0) {
        clGetPlatformIDs(count, platform, NULL);
    }

    /* Only the first reported platform is used for device selection. */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    if (ret != 0) {
        printf("clGetPlatformIDs error: %d. couldn't load\n", ret);
        exit(1);
    }

    /* NOTE(review): CL_DEVICE_TYPE_DEFAULT leaves the choice of device to
     * the platform; if a GPU is required, CL_DEVICE_TYPE_GPU may be what is
     * intended here - confirm. */
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
    if (ret != 0) {
        printf("clGetDeviceIDs error: %d. couldn't load\n", ret);
        exit(1);
    }

    /* Create OpenCL context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (ret != 0) {
        printf("clCreateContext error: %d. couldn't load\n", ret);
        exit(1);
    }

    /* Create Command Queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    if (ret != 0) {
        printf("clCreateCommandQueue error: %d. couldn't load\n", ret);
        exit(1);
    }
    /* Initialization complete */

    /* Create Kernel Program from the source */
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
    if (ret != 0) {
        printf("clCreateProgramWithSource error: %d. couldn't load\n", ret);
        exit(1);
    }

    /* Build Kernel Program; "date" is run before and after the build so the
     * build duration can be read from the output. */
    printf("loading GPU kernel..\n");
    system("date");
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    system("date");
    if (ret != 0) {
        printf("clBuildProgram error: %d. couldn't load\n", ret);

        /* Determine the size of the log; stays 0 if the query fails. */
        size_t log_size = 0;
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

        /* Zero-filled and one byte larger than reported, so the text is
         * always terminated even if the second query writes nothing. */
        char *log = (char *) calloc(log_size + 1, 1);
        if (log != NULL) {
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
            printf("%s\n", log);
            free(log);
        }
        exit(1);
    }

    /* The host copy of the source text is no longer needed once the
     * program has been created and built. */
    free(source_str);
    printf("GPU kernel loaded successfuly..\n");

    /* Create OpenCL Kernel */
    printf("creating kernel program..\n");
    kernel = clCreateKernel(program, "gpu_solve", &ret);
    if (ret != 0) {
        printf("clCreateKernel error: %d. couldn't load\n", ret);
        exit(1);
    }
    printf("kernel program created successfuly..\n");
}

执行内核的代码:

/* Return the current wall-clock time in seconds.
 * The timezone argument of gettimeofday() is passed as NULL because POSIX
 * leaves the behaviour unspecified for a non-null pointer. */
static double gpu_solve_now(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return (now.tv_usec/1000000.0) + now.tv_sec;
}

/* Print "<what> error: <code>. couldn't load" and exit(1) when status is
 * non-zero; do nothing otherwise. */
static void gpu_solve_check(cl_int status, const char *what)
{
    if (status != 0) {
        printf("%s error: %d. couldn't load\n", what, status);
        exit(1);
    }
}

/*
 * Run the kernel once and copy the results back to the host arrays.
 *
 * Steps, each timed and reported on stdout:
 *   1. "sending":  create four device buffers initialised from cti_spcm,
 *                  cti_pvpm, cti_frm and cti_ipcm, and set the six kernel
 *                  arguments.
 *   2. "calctime": enqueue the kernel and wait for it to finish.
 *   3. "recv":     read all four buffers back into the host arrays and
 *                  release the buffers.
 *
 * Uses context, command_queue, kernel and the host arrays/sizes declared
 * outside this function. Any OpenCL failure prints a message and ends the
 * process with exit(1).
 *
 * Returns 0.
 */
int gpu_solve()
{
    printf("Calling gpu_solve\n");
    cl_int ret;
    cl_event event;
    cl_mem spcmBuffer, pvpmBuffer, frmBuffer, ipcmBuffer;
    /* The kernel is launched with a global size of one work-item. */
    size_t global_work_size[1] = {1};
    double sending = 0.0, recv = 0.0, calctime = 0.0;
    double temp = 0.0;

    /* --- Phase 1: buffers and kernel arguments --- */
    temp = gpu_solve_now();
    spcmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numsp * sizeof(double),(void *) cti_spcm, &ret);
    gpu_solve_check(ret, "clCreateBuffer spcmBuffer");
    pvpmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numparams * sizeof(double),(void *) cti_pvpm, &ret);
    gpu_solve_check(ret, "clCreateBuffer pvpmBuffer");
    frmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numreactions * sizeof(double),(void *) cti_frm, &ret);
    gpu_solve_check(ret, "clCreateBuffer frmBuffer");
    ipcmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numsim_spc * sizeof(double),(void *) cti_ipcm, &ret);
    gpu_solve_check(ret, "clCreateBuffer ipcmBuffer");

    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&ipcmBuffer);
    gpu_solve_check(ret, "clSetKernelArg 0");
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&spcmBuffer);
    gpu_solve_check(ret, "clSetKernelArg 1");
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&pvpmBuffer);
    gpu_solve_check(ret, "clSetKernelArg 2");
    ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&frmBuffer);
    gpu_solve_check(ret, "clSetKernelArg 3");
    ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&ct_numsim_spc);
    gpu_solve_check(ret, "clSetKernelArg 4");
    ret = clSetKernelArg(kernel, 5, sizeof(double), (void *)&ct_deltime);
    gpu_solve_check(ret, "clSetKernelArg 5");

    sending = gpu_solve_now() - temp;
    printf("sending time %lf\n",sending);

    /* --- Phase 2: kernel execution --- */
    temp = gpu_solve_now();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &event);
    gpu_solve_check(ret, "clEnqueueNDRangeKernel");
    /* A failed wait means the kernel did not complete normally; stop here
     * instead of timing and reading back results from a failed run. */
    ret = clWaitForEvents(1, &event);
    gpu_solve_check(ret, "clWaitForEvents");
    /* NOTE(review): event is not released anywhere in this function; a
     * clReleaseEvent(event) after the wait looks missing - confirm. */
    calctime = gpu_solve_now() - temp;
    printf("calctime time %lf\n",calctime);

    /* --- Phase 3: blocking read-back of all four buffers --- */
    temp = gpu_solve_now();
    ret = clEnqueueReadBuffer(command_queue, ipcmBuffer, CL_TRUE, 0, numsim_spc * sizeof(double), cti_ipcm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer ipcmBuffer");
    ret = clEnqueueReadBuffer(command_queue, spcmBuffer, CL_TRUE, 0, numsp * sizeof(double), cti_spcm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer spcmBuffer");
    ret = clEnqueueReadBuffer(command_queue, pvpmBuffer, CL_TRUE, 0, numparams * sizeof(double), cti_pvpm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer pvpmBuffer");
    ret = clEnqueueReadBuffer(command_queue, frmBuffer, CL_TRUE, 0, numreactions * sizeof(double), cti_frm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer frmBuffer");

    /* Best-effort cleanup: release results are deliberately not treated as
     * fatal, matching the previous behaviour. */
    (void) clReleaseMemObject(spcmBuffer);
    (void) clReleaseMemObject(pvpmBuffer);
    (void) clReleaseMemObject(frmBuffer);
    (void) clReleaseMemObject(ipcmBuffer);

    recv = gpu_solve_now() - temp;
    printf("recv time %lf\n",recv);
    return 0;
}

内核代码:

/*
 * Fill cti_ipcm from values in cti_fprm.
 *
 * Each statement shown stores into one cti_ipcm element a sum/difference of
 * cti_fprm elements multiplied by a constant factor. Only the first three
 * assignments are listed; the rest of the body is elided in this listing.
 *
 * cti_ipcm: array written by this function.
 * cti_fprm: array read by this function (in the statements shown).
 *
 * NOTE(review): the gpu_solve kernel calls calc_ipcm(), not calc_ipcm_cc();
 * confirm whether that is a naming mismatch or a different function.
 */
void calc_ipcm_cc (__global double* cti_ipcm, __global double* cti_fprm) {
    cti_ipcm[0] = (-cti_fprm[605] + cti_fprm[3135])*5.000000e-01;
    cti_ipcm[1] = (cti_fprm[132] - cti_fprm[1037] + cti_fprm[6734])*1.004016e+01;
    cti_ipcm[2] = (cti_fprm[3993] - cti_fprm[4090])*5.000000e-01;
    .....
    ....
    ~10000 equations to be solved
}

/*
 * Update cti_fprm and cti_pvpm from cti_spcm.
 *
 * The statements shown assign elements of cti_fprm and cti_pvpm using
 * elements of cti_spcm and previously assigned cti_pvpm elements. Only the
 * first four assignments are listed; the rest of the body is elided in this
 * listing.
 *
 * cti_spcm: array read by this function (in the statements shown).
 * cti_pvpm: array both written and read.
 * cti_fprm: array written by this function (in the statements shown).
 */
void gpu_eval (__global double* cti_spcm, __global double* cti_pvpm, __global double* cti_fprm) {
    cti_fprm[7632] = ((1.00000000e+00*cti_spcm[2986])/(1.00000000e+00+cti_spcm[2986])) ;
    cti_pvpm[6208] = (((1.00000000e+00*cti_spcm[2986])*1.66000000e+00) );
    /* Order matters: this reads cti_pvpm[6208], assigned on the line above. */
    cti_pvpm[4434] = (((cti_pvpm[6208]*cti_spcm[212])/(1.00000000e+00+cti_spcm[212])) );
    cti_fprm[7633] = cti_pvpm[4434] ;
    .....
    ....
    ~10000 equations to be solved
}

/*
 * Kernel entry point: calls calc_ipcm() followed by gpu_eval() once per
 * step, advancing a simulated time from 0.0 in increments of ct_deltime
 * for as long as it is <= 50000.0.
 *
 * cti_ipcm, cti_fprm: passed to calc_ipcm().
 * cti_spcm, cti_pvpm, cti_fprm: passed to gpu_eval().
 * ct_numsim_spc: not used in this body.
 * ct_deltime: overwritten below before it is ever read.
 *
 * NOTE(review): calc_ipcm() is not defined in the visible listing (only
 * calc_ipcm_cc() is) - confirm which function is meant.
 */
__kernel void gpu_solve(__global double* cti_ipcm,__global double* cti_spcm, __global double* cti_pvpm, __global double* cti_fprm, int ct_numsim_spc, double ct_deltime)
{
    /* The step size supplied by the caller is replaced with a fixed value.
     * NOTE(review): looks like a leftover override - confirm it is wanted. */
    ct_deltime = 0.00001;

    for (double elapsed = 0.0; elapsed <= 50000.0; elapsed += ct_deltime) {
        calc_ipcm(cti_ipcm,cti_fprm);
        gpu_eval(cti_spcm,cti_pvpm,cti_fprm);
    }
}

1 个答案:

答案 0 :(得分:0)

您使用的是 CL_DEVICE_TYPE_DEFAULT,我很确定您实际用的是 CPU 而不是 GPU。因为如果单个内核需要运行 16 分钟才能完成,GPU 驱动程序早就会重新启动了。