Question

我是OpenCL的初学者，我正在尝试运行《OpenCL in Action》一书中的示例代码。我用下面的代码来获取设备的首选矢量宽度。我的计算机上检测到两个平台：一个来自英特尔（酷睿i7和高清显卡），另一个来自NVIDIA（GeForce 940M）。每次运行代码时，每种类型的矢量宽度都是“1”，只有double类型因为不受支持而返回0。即使我换用计算机上的另一个平台来检查其设备，结果也是一样的。我在一台AMD计算机上运行同样的代码，它似乎工作正常，因为不同类型得到了不同的数值。但是，我不明白为什么这段代码在我的计算机的各个平台上对每种类型都给出“1”。有任何想法吗？这是输出：

以下是代码：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>

int main(){

    cl_int err, i, j;
    cl_platform_id *platforms;
    cl_device_id *devices;
    cl_uint num_platforms, num_devices, vector_width;
    size_t plat_name_size, devi_name_size;
    char *plat_name_data, *devi_name_data;


    err = clGetPlatformIDs(1, NULL, &num_platforms);
    if (err < 0){
        perror("No platform is found");
        exit(1);
    }
    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
    clGetPlatformIDs(num_platforms, platforms, NULL);

    printf("Number of found platforms is %d\n ", num_platforms);

    for (i = 0; i < num_platforms; i++){

        err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
        if (err < 0){
            perror("Couldn't read platform name.");
            exit(1);
        }
        plat_name_data = (char*)malloc(plat_name_size);
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL);
        printf("Platform No.%d is: %s\n", i, plat_name_data);

        err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
        if (err < 0){
            perror("No device is found in this platform");
            exit(1);
        }
        devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices));
        clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
        printf("Number of devices found in this platform is: %d\n", num_devices);
        for (j = 0; j < num_devices; j++){
            err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size);
            if (err < 0){
                perror("Couldn't read the device name.");
                exit(1);
            }
            devi_name_data = (char*)malloc(devi_name_size);
            clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL);
            printf("Device No.%d name is: %s\n", j + 1, devi_name_data);
            if (strstr(devi_name_data, "GeForce 940M")){
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, 
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in chars: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in shorts: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in ints: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in longs: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in floats: %u\n", vector_width);
                clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                    sizeof(cl_uint), &vector_width, NULL);
                printf("Preferred vector width in doubles: %u\n", vector_width);
            }
        }

    }
    return 0;
}

Answer 1

简短回答：您的查询方式没有问题，平台的编译器知道最佳的矢量宽度是多少。所以是的，这些值是正确的。

长答案：CPU（任何类型的CPU）都可能更偏好非矢量化的代码。尤其是英特尔CPU加英特尔编译器的组合：由于英特尔编译器会把矢量化作为其优化过程的一部分自动完成，所以它更希望用户一开始就不要手动对代码进行矢量化。

事实上，看起来NVIDIA也更喜欢用户输入非矢量化的代码。这并不意味着代码在已经矢量化的情况下会运行得更慢，只是意味着编译器（由于其自身的优化技术）更希望代码是未经矢量化的。

更新OpenCL驱动程序可能会使这些值发生变化。此外，你应该只把它们当作指导性的参考。其他因素，例如本地内存的使用、合并的全局内存访问、本地工作组大小等，通常更为重要。

Answer 2

这是我做过的一个实验，看看矢量化操作如何在喜欢进行标量运算的设备中执行。我用两个不同的内核实现了约简算法。第一个内核将数据视为标量，而第二个内核将数据视为float4向量（代码如下）。这是执行结果。很明显，虽然NVIDIA设备更喜欢非矢量化操作，但矢量化操作更快。

Preferred vector width: 1
reduction_scalar: Check passed.
Total time = 4471424

reduction_vector: Check passed.
Total time = 1723776

以下是代码：

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"

#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {

    cl_platform_id platform;
    cl_device_id dev;
    int err;

    /* Identify a platform */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err < 0) {
        perror("Couldn't identify a platform");
        exit(1);
    }

    /* Access a device */
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
    if (err == CL_DEVICE_NOT_FOUND) {
        printf(" GPU is not first! Going on CPU :(");
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
    }
    if (err < 0) {
        perror("Couldn't access any devices");
        exit(1);
    }

    return dev;
}

/* Create program from a file and compile it.
 *
 * ctx      - context the program is created in
 * dev      - device whose build log is printed if compilation fails
 * filename - path of the OpenCL C source file
 *
 * Returns the built program. Exits with status 1 if the file cannot be
 * read, memory cannot be allocated, the program cannot be created, or the
 * build fails (the build log is printed to standard output first). */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size;
    long file_size;
    cl_int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "r");
    if (program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    if (fseek(program_handle, 0, SEEK_END) != 0 ||
        (file_size = ftell(program_handle)) < 0) {
        perror("Couldn't determine the size of the program file");
        fclose(program_handle);
        exit(1);
    }
    rewind(program_handle);
    program_buffer = (char*)malloc((size_t)file_size + 1);
    if (program_buffer == NULL) {
        fprintf(stderr, "Couldn't allocate memory for the program source\n");
        fclose(program_handle);
        exit(1);
    }
    /* The stream is in text mode, so fread may deliver fewer characters than
     * ftell reported (e.g. when line endings are translated). The count that
     * was actually read is therefore used as the source length; otherwise
     * uninitialized bytes would be handed to the OpenCL compiler. */
    program_size = fread(program_buffer, sizeof(char), (size_t)file_size, program_handle);
    if (ferror(program_handle)) {
        perror("Couldn't read the program file");
        free(program_buffer);
        fclose(program_handle);
        exit(1);
    }
    program_buffer[program_size] = '\0';
    fclose(program_handle);

    /* Create program from file */
    program = clCreateProgramWithSource(ctx, 1,
        (const char**)&program_buffer, &program_size, &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr, "Couldn't create the program (OpenCL error %d)\n", (int)err);
        free(program_buffer);
        exit(1);
    }
    free(program_buffer);

    /* Build program */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {

        /* Find size of log and print to std output. log_size starts at 0 so
         * a failed size query still leaves a well-defined empty log. */
        log_size = 0;
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            0, NULL, &log_size);
        program_log = (char*)malloc(log_size + 1);
        if (program_log != NULL) {
            program_log[0] = '\0';
            program_log[log_size] = '\0';
            clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
                log_size + 1, program_log, NULL);
            printf("%s\n", program_log);
            free(program_log);
        }
        fprintf(stderr, "Couldn't build the program (OpenCL error %d)\n", (int)err);
        exit(1);
    }

    return program;
}

int main() {

    /* OpenCL structures */
    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel kernel[NUM_KERNELS];
    cl_command_queue queue;
    cl_event prof_event;
    cl_int i, j, err, preferred_width;
    size_t local_size, global_size;
    char kernel_names[NUM_KERNELS][20] =
    { "reduction_scalar", "reduction_vector" };

    /* Data and buffers */
    float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);
    //float data[ARRAY_SIZE];
    float sum, actual_sum, *scalar_sum, *vector_sum;
    cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
    cl_int num_groups;
    cl_ulong time_start, time_end, total_time;

    /* Initialize data */
    for (i = 0; i<ARRAY_SIZE; i++) {
        data[i] = 1.0f*i;
    }

    /* Create device and determine local size */
    device = create_device();
    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
        sizeof(preferred_width), &preferred_width, NULL);
    printf("Preferred vector width: %d\n", preferred_width);
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(local_size), &local_size, NULL);
    if (err < 0) {
        perror("Couldn't obtain device information");
        exit(1);
    }

    /* Allocate and initialize output arrays */
    num_groups = ARRAY_SIZE / local_size;
    scalar_sum = (float*)malloc(num_groups * sizeof(float));
    vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
    for (i = 0; i<num_groups; i++) {
        scalar_sum[i] = 0.0f;
    }
    for (i = 0; i<num_groups / 4; i++) {
        vector_sum[i] = 0.0f;
    }

    /* Create a context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    /* Build program */
    program = build_program(context, device, PROGRAM_FILE);

    /* Create data buffer */
    data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
        CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
    scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
    vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
    if (err < 0) {
        perror("Couldn't create a buffer");
        exit(1);
    };

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device,
        CL_QUEUE_PROFILING_ENABLE, &err);
    if (err < 0) {
        perror("Couldn't create a command queue");
        exit(1);
    };

    for (i = 0; i<NUM_KERNELS; i++) {

        /* Create a kernel */
        kernel[i] = clCreateKernel(program, kernel_names[i], &err);
        if (err < 0) {
            perror("Couldn't create a kernel");
            exit(1);
        };

        /* Create kernel arguments */
        err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
        if (i == 0) {
            global_size = ARRAY_SIZE;
            err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
            err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
        }
        else {
            global_size = ARRAY_SIZE / 4;
            err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
            err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
        }
        if (err < 0) {
            perror("Couldn't create a kernel argument");
            exit(1);
        }

        /* Enqueue kernel */
        err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
            &local_size, 0, NULL, &prof_event);
        if (err < 0) {
            perror("Couldn't enqueue the kernel");
            exit(1);
        }

        /* Finish processing the queue and get profiling information */
        clFinish(queue);
        clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
            sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
            sizeof(time_end), &time_end, NULL);
        total_time = time_end - time_start;

        /* Read the result */
        if (i == 0) {
            err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
                num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
            if (err < 0) {
                perror("Couldn't read the buffer");
                exit(1);
            }
            sum = 0.0f;
            for (j = 0; j<num_groups; j++) {
                sum += scalar_sum[j];
            }
        }
        else {
            err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
                num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
            if (err < 0) {
                perror("Couldn't read the buffer");
                exit(1);
            }
            sum = 0.0f;
            for (j = 0; j<num_groups / 4; j++) {
                sum += vector_sum[j];
            }
        }

        /* Check result */
        printf("%s: ", kernel_names[i]);
        actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
        if (fabs(sum - actual_sum) > 0.01*fabs(sum))
            printf("Check failed.\n");
        else
            printf("Check passed.\n");
        printf("Total time = %lu\n\n", total_time);

        /* Deallocate event */
        clReleaseEvent(prof_event);
    }

    /* Deallocate resources */
    free(scalar_sum);
    free(vector_sum);
    for (i = 0; i<NUM_KERNELS; i++) {
        clReleaseKernel(kernel[i]);
    }
    clReleaseMemObject(scalar_sum_buffer);
    clReleaseMemObject(vector_sum_buffer);
    clReleaseMemObject(data_buffer);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    return 0;
}

和内核：

/* Per-work-group sum of floats.
 *
 * Each work-item copies one element of `data` into `partial_sums`, then the
 * group repeatedly folds the upper half of the active range onto the lower
 * half. Work-item 0 finally stores the group's total in
 * output[get_group_id(0)].
 *
 * The halving loop only visits every element when the work-group size is a
 * power of two.
 */
__kernel void reduction_scalar(__global float* data, 
      __local float* partial_sums, __global float* output) {

   const int local_idx = get_local_id(0);
   const int items_in_group = get_local_size(0);

   /* Stage this work-item's element in local memory. */
   partial_sums[local_idx] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Tree reduction: halve the active range until one element remains. */
   int stride = items_in_group / 2;
   while (stride > 0) {
      if (local_idx < stride) {
         partial_sums[local_idx] = partial_sums[local_idx]
                                 + partial_sums[local_idx + stride];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
      stride >>= 1;
   }

   /* A single work-item publishes the group's result. */
   if (local_idx == 0) {
      output[get_group_id(0)] = partial_sums[0];
   }
}

/* Per-work-group sum of floats, processed four at a time as float4.
 *
 * Each work-item copies one float4 of `data` into `partial_sums`, then the
 * group repeatedly folds the upper half of the active range onto the lower
 * half component-wise. Work-item 0 collapses the remaining float4 to a
 * single float with a dot product against (1,1,1,1) and stores it in
 * output[get_group_id(0)].
 *
 * The halving loop only visits every element when the work-group size is a
 * power of two.
 */
__kernel void reduction_vector(__global float4* data, 
      __local float4* partial_sums, __global float* output) {

   const int local_idx = get_local_id(0);
   const int items_in_group = get_local_size(0);

   /* Stage this work-item's float4 in local memory. */
   partial_sums[local_idx] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Tree reduction: halve the active range until one float4 remains. */
   int stride = items_in_group / 2;
   while (stride > 0) {
      if (local_idx < stride) {
         partial_sums[local_idx] = partial_sums[local_idx]
                                 + partial_sums[local_idx + stride];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
      stride >>= 1;
   }

   /* A single work-item adds the four lanes and publishes the result. */
   if (local_idx == 0) {
      output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
   }
}

opencl设备中的首选矢量宽度

2 个答案: