我正在使用显卡 Nvidia GeForce GT 630 运行 OpenCL 程序。程序收集一个 double 数组,并将其作为缓冲区参数传递给 GPU 内核,计算就是使用这个数组进行的。执行工作大约需要10分钟。我使用的机器是英特尔(R)Core(TM)i5-3470 CPU。
类似的统计数据,
机器:英特尔(R)Core(TM)i3 CPU
显卡: GeForce 9500 GT
同一程序所花费的时间:16分钟。
我认为时间上的差异可能是由于显卡不同造成的。因此,我把英特尔(R)Core(TM)i3 机器中的 GeForce 9500 GT 换成了英特尔(R)Core(TM)i5-3470 机器上的 GeForce GT 630。
但仍然是16分钟。程序运行期间 CPU 和 GPU 之间没有任何交互。既然所有计算都只在同一块 GPU 上进行,为什么在低端机器上安装高端显卡后,时间上却没有任何改进?有人可以给我一些建议吗?
提前感谢。
创建内核的代码:
void create_kernel () {
FILE *fp;
char *source_str;
fp = fopen("cw_calc.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
cl_uint count;
clGetPlatformIDs(10, NULL, &count);
// get all platforms in array platfroms
platform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * count);
clGetPlatformIDs(count, platform, NULL);
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
if (ret != 0) {
printf("clGetPlatformIDs error: %d. couldn't load\n", ret);
exit(1);
}
//////////////////////////////////////////////////////////////////////////////////////
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
if (ret != 0) {
printf("clGetDeviceIDs error: %d. couldn't load\n", ret);
exit(1);
}
/* Create OpenCL context */
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
if (ret != 0) {
printf("clCreateContext error: %d. couldn't load\n", ret);
exit(1);
}
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
if (ret != 0) {
printf("clCreateCommandQueue error: %d. couldn't load\n", ret);
exit(1);
}
/*Initialization complete*/
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
if (ret != 0) {
printf("clCreateProgramWithSource error: %d. couldn't load\n", ret);
exit(1);
}
/* Build Kernel Program */
printf("loading GPU kernel..\n");
system("date");
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
system("date");
if (ret != 0) {
printf("clBuildProgram error: %d. couldn't load\n", ret);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
exit(1);
}
printf("GPU kernel loaded successfuly..\n");
/* Create OpenCL Kernel */
printf("creating kernel program..\n");
kernel = clCreateKernel(program, "gpu_solve", &ret);
if (ret != 0) {
printf("clCreateKernel error: %d. couldn't load\n", ret);
exit(1);
}
printf("kernel program created successfuly..\n");
}
执行内核的代码:
/* Current wall-clock time in seconds, with microsecond resolution. */
static double gpu_solve_wall_seconds(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return (now.tv_usec/1000000.0) + now.tv_sec;
}

/* Print "<what> error: <code>. couldn't load" and exit(1) when status is non-zero. */
static void gpu_solve_check(cl_int status, const char *what)
{
    if (status != 0) {
        printf("%s error: %d. couldn't load\n", what, status);
        exit(1);
    }
}

/*
 * Run the "gpu_solve" kernel once and copy all four arrays back to the host.
 *
 * Steps:
 *   1. create four device buffers initialised from cti_spcm, cti_pvpm,
 *      cti_frm and cti_ipcm (CL_MEM_COPY_HOST_PTR),
 *   2. bind the six kernel arguments,
 *   3. enqueue the kernel with a global work size of 1 and wait for it,
 *   4. read every buffer back into its host array with blocking reads,
 *   5. release the buffers and the completion event.
 *
 * The elapsed time of the setup, kernel, and read-back phases is printed.
 * Any OpenCL error prints a diagnostic and terminates the process with
 * exit(1).
 *
 * Returns 0 on success.
 */
int gpu_solve()
{
    cl_int ret;
    cl_event event;
    cl_mem spcmBuffer, pvpmBuffer, frmBuffer, ipcmBuffer;
    /* A single work-item is launched, so the kernel runs serially on the device. */
    size_t global_work_size[1] = {1};
    double start;

    printf("Calling gpu_solve\n");

    /* Phase 1: buffer creation and argument binding. */
    start = gpu_solve_wall_seconds();

    spcmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numsp * sizeof(double),(void *) cti_spcm, &ret);
    gpu_solve_check(ret, "clCreateBuffer spcmBuffer");
    pvpmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numparams * sizeof(double),(void *) cti_pvpm, &ret);
    gpu_solve_check(ret, "clCreateBuffer pvpmBuffer");
    frmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numreactions * sizeof(double),(void *) cti_frm, &ret);
    gpu_solve_check(ret, "clCreateBuffer frmBuffer");
    ipcmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numsim_spc * sizeof(double),(void *) cti_ipcm, &ret);
    gpu_solve_check(ret, "clCreateBuffer ipcmBuffer");

    gpu_solve_check(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&ipcmBuffer), "clSetKernelArg 0");
    gpu_solve_check(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&spcmBuffer), "clSetKernelArg 1");
    gpu_solve_check(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&pvpmBuffer), "clSetKernelArg 2");
    gpu_solve_check(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&frmBuffer), "clSetKernelArg 3");
    gpu_solve_check(clSetKernelArg(kernel, 4, sizeof(int), (void *)&ct_numsim_spc), "clSetKernelArg 4");
    gpu_solve_check(clSetKernelArg(kernel, 5, sizeof(double), (void *)&ct_deltime), "clSetKernelArg 5");

    printf("sending time %lf\n", gpu_solve_wall_seconds() - start);

    /* Phase 2: kernel execution. */
    start = gpu_solve_wall_seconds();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &event);
    gpu_solve_check(ret, "clEnqueueNDRangeKernel");
    /* A failed kernel run is reported through the wait, so its result must be checked. */
    gpu_solve_check(clWaitForEvents(1, &event), "clWaitForEvents");
    clReleaseEvent(event);
    printf("calctime time %lf\n", gpu_solve_wall_seconds() - start);

    /* Phase 3: blocking read-back of every buffer, then cleanup. */
    start = gpu_solve_wall_seconds();
    ret = clEnqueueReadBuffer(command_queue, ipcmBuffer, CL_TRUE, 0, numsim_spc * sizeof(double), cti_ipcm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer ipcmBuffer");
    ret = clEnqueueReadBuffer(command_queue, spcmBuffer, CL_TRUE, 0, numsp * sizeof(double), cti_spcm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer spcmBuffer");
    ret = clEnqueueReadBuffer(command_queue, pvpmBuffer, CL_TRUE, 0, numparams * sizeof(double), cti_pvpm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer pvpmBuffer");
    ret = clEnqueueReadBuffer(command_queue, frmBuffer, CL_TRUE, 0, numreactions * sizeof(double), cti_frm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer frmBuffer");

    /* Release failures are deliberately non-fatal: the results are already on the host. */
    clReleaseMemObject(spcmBuffer);
    clReleaseMemObject(pvpmBuffer);
    clReleaseMemObject(frmBuffer);
    clReleaseMemObject(ipcmBuffer);

    printf("recv time %lf\n", gpu_solve_wall_seconds() - start);
    return 0;
}
内核代码:
/*
 * Fill cti_ipcm from cti_fprm.
 *
 * In the visible statements each cti_ipcm[k] is assigned a signed sum of
 * cti_fprm entries multiplied by a constant factor. cti_fprm is only read.
 * The remaining statements are elided in this excerpt.
 *
 * NOTE(review): the kernel entry point calls `calc_ipcm`, not `calc_ipcm_cc`;
 * confirm which name is intended.
 */
void calc_ipcm_cc (__global double* cti_ipcm, __global double* cti_fprm) {
cti_ipcm[0] = (-cti_fprm[605] + cti_fprm[3135])*5.000000e-01;
cti_ipcm[1] = (cti_fprm[132] - cti_fprm[1037] + cti_fprm[6734])*1.004016e+01;
cti_ipcm[2] = (cti_fprm[3993] - cti_fprm[4090])*5.000000e-01;
.....
....
~10000 equations to be solved
}
/*
 * Update cti_fprm and cti_pvpm from cti_spcm.
 *
 * In the visible statements cti_spcm is only read, while cti_pvpm and
 * cti_fprm are written. The statements are order-dependent: a value stored
 * into cti_pvpm by one line is read back by a later line.
 * The remaining statements are elided in this excerpt.
 */
void gpu_eval (__global double* cti_spcm, __global double* cti_pvpm, __global double* cti_fprm) {
cti_fprm[7632] = ((1.00000000e+00*cti_spcm[2986])/(1.00000000e+00+cti_spcm[2986])) ;
cti_pvpm[6208] = (((1.00000000e+00*cti_spcm[2986])*1.66000000e+00) );
/* Reads cti_pvpm[6208], which was written on the previous line. */
cti_pvpm[4434] = (((cti_pvpm[6208]*cti_spcm[212])/(1.00000000e+00+cti_spcm[212])) );
cti_fprm[7633] = cti_pvpm[4434] ;
.....
....
~10000 equations to be solved
}
/*
 * Kernel entry point: repeatedly calls calc_ipcm and gpu_eval on the four
 * global arrays while advancing a simulated time from 0.0 until it exceeds
 * 50000.0.
 *
 * The body never queries a work-item id, so every work-item that is launched
 * performs the same full sequence of updates on the same arrays.
 *
 * NOTE(review): the ct_deltime argument is overwritten with 0.00001 before it
 * is used, so the value passed by the host has no effect. With that step the
 * loop runs roughly 5e9 iterations.
 * NOTE(review): i, ispc, mnspcidx and ct_numsim_spc are not used in this body.
 * NOTE(review): `calc_ipcm` is called here, but the helper visible in this
 * excerpt is named `calc_ipcm_cc` -- confirm the intended name.
 * NOTE(review): `double` in a kernel may require the cl_khr_fp64 extension on
 * the target device -- confirm it is supported and enabled.
 */
__kernel void gpu_solve(__global double* cti_ipcm,__global double* cti_spcm, __global double* cti_pvpm, __global double* cti_fprm, int ct_numsim_spc, double ct_deltime)
{
int i = 0, ispc, mnspcidx;
/* Fixed time step; replaces whatever the host passed in. */
ct_deltime = 0.00001;
double simtime = 0.0;
while (simtime <= 50000.0) {
calc_ipcm(cti_ipcm,cti_fprm);
gpu_eval(cti_spcm,cti_pvpm,cti_fprm);
simtime = simtime + ct_deltime;
}
}
答案 0 :(得分:0)
您正在使用 CL_DEVICE_TYPE_DEFAULT,我很确定您实际使用的是 CPU 而不是 GPU。因为如果单个内核需要16分钟才能完成,GPU 驱动程序早就被重置了。