Question

我一直在尝试编写一个程序，用来计算 Cantor 集中具有特定分母的有理数。我发现用我的电脑计算分母在 3^14 到 3^15 之间的数字需要 20 个小时甚至更长时间。由于这个问题需要测试大量相互独立的值，我认为用 OpenCL 在显卡上实现会是个好主意。然而，当我尝试实现之后，性能却比我的 CPU 实现慢了几个数量级。下面是我尝试的代码。

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <functional>
#include <ctime>
#include <iostream>
#include <fstream>
#include <exception>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <string>
#include <algorithm>
#include <thread>
#include <cmath>
#include <sstream>

#define SUCCESS 0
#define FAILURE 1
#define EXPECTED_FAILURE 2

const int NUM_ELEMENTS = 32768;

/**
 * Prints every non-zero entry of a batch to std::cout as "<start + index>,<value>".
 *
 * @param start  number reported for index 0; index i is reported as start + i.
 * @param values array holding NUM_ELEMENTS entries; entries equal to 0 are skipped.
 */
void printOutput(unsigned long long start, unsigned long long *values){
    for(int i = 0; i < NUM_ELEMENTS; i++)
        if (values[i] != 0)
            // '\n' rather than std::endl: flushing on every line is needlessly slow
            // when a batch can contain thousands of lines.
            std::cout << start+i << ',' << values[i] << '\n';
    // Flush once per batch so the output is still visible promptly.
    std::cout.flush();
}

/**
 * Fills a batch with consecutive numbers.
 *
 * @param start    value written to the first slot.
 * @param dataList array with room for NUM_ELEMENTS entries; slot i receives start + i.
 */
void newList(unsigned long long start, unsigned long long *dataList){
    unsigned long long nextValue = start;
    unsigned long long *const end = dataList + NUM_ELEMENTS;
    for(unsigned long long *slot = dataList; slot != end; ++slot){
        *slot = nextValue;
        ++nextValue;
    }
}

using namespace cl;

Kernel kernelA;
Context context;
CommandQueue queue;
Buffer inputBuffer;
Buffer outputBuffer;

int init() {
    cl_int status = 0;
    const char* buildOption ="-x clc++ ";
    std::vector<Platform> platforms;
    status = Platform::get(&platforms);
    if (status != CL_SUCCESS){
        std::cout<<"Error: Getting platforms!"<<std::endl;
        return FAILURE;
    }
    std::vector<cl::Platform>::iterator iter;
    for(iter = platforms.begin(); iter != platforms.end(); ++iter)
        if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
            break;
    cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter) (), 0};
    bool gpuNotFound = false;
    try{
        context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status);
    }
    catch(std::exception e){
        gpuNotFound = true;
    }
    if(gpuNotFound){
        std::cout<<"GPU not found, falling back to CPU!"<<std::endl;
        context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status);
        if (status != CL_SUCCESS){
            std::cout<<"Error: Creating context!"<<std::endl;
            return FAILURE;
        }
    }
    Program program;
    try{
        std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
        queue = CommandQueue(context, devices[0]);
        std::ifstream sourceFile("Rationals.cl");
        std::string sourceCode(
            std::istreambuf_iterator<char>(sourceFile),
            (std::istreambuf_iterator<char>()));
        Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
        program = Program(context, source);
        program.build(devices, buildOption);
        kernelA = Kernel(program, "countRationals");
        inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
        outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
    }catch(cl::Error e){
        std::cout << e.what() << std::endl;
        std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(cl::Device::getDefault()) << std::endl;
        std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(cl::Device::getDefault()) << std::endl;
        std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault()) << std::endl;
        return FAILURE;
    }
    return SUCCESS;
}

int execute(unsigned long long* inputList, unsigned long long* outputList) {
    try{
        queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList);
        kernelA.setArg(0, inputBuffer);
        kernelA.setArg(1, outputBuffer);
        NDRange global(NUM_ELEMENTS/2);
        NDRange local(256);
        queue.enqueueNDRangeKernel(kernelA, NullRange, global, local);
        queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList);
    }catch(cl::Error e){
        std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
        return FAILURE;
    }
    return SUCCESS;
}

using namespace std;

int main(int argc, char* argv[]){
    unsigned long long minNum, maxNum;
    if (argc == 2){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[1]) + 1);
    }
    else if (argc == 3){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[2]));
    }
    else if (argc == 4){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[2]));
    }
    else return -1;
    unsigned long long *q = nullptr, *result = nullptr, *old = nullptr, *newq = nullptr;
    thread workThread, outThread, genThread;
    q = new unsigned long long[NUM_ELEMENTS];
    newList(minNum, q);
    result = new unsigned long long[NUM_ELEMENTS];
    newq = new unsigned long long[NUM_ELEMENTS];
    init();
    genThread = thread(newList, minNum+NUM_ELEMENTS, newq);
    workThread = thread(execute, q, result);
    workThread.join();
    genThread.join();
    for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum  + NUM_ELEMENTS; i += NUM_ELEMENTS){
        old = result;
        q = newq;
        result = new unsigned long long[NUM_ELEMENTS];
        newq = new unsigned long long[NUM_ELEMENTS];
        genThread = thread(newList, i+NUM_ELEMENTS, newq);
        workThread = thread(execute, q, result);
        outThread = thread(printOutput, i-NUM_ELEMENTS, old);
        workThread.join();
        outThread.join();
        genThread.join();
        delete[] old;
        delete[] q;
        q = old = nullptr;
    }
    delete[] newq;
    delete[] result;
    return 0;
}

内核代码

// Decides whether the fraction p/q passes the ternary-digit test used by the kernel.
// Phase 1 removes every factor of 3 from q, inspecting one leading digit per factor:
// a digit of 1 ends the test (accepted only if the remainder is exactly p == q).
// Phase 2 repeatedly triples p modulo q; a digit of 1 rejects, and returning to the
// starting remainder accepts.
bool testCantor(unsigned long p, unsigned long q){
    while(q % 3 == 0){
        q /= 3;
        const unsigned long digit = p / q;
        if (digit == 1)
            return p == q;
        p -= digit * q; // same value as p % q
    }
    const unsigned long first = p;
    for(;;){
        const unsigned long tripled = 3 * p;
        const unsigned long digit = tripled / q;
        if (digit == 1)
            return false;
        p = tripled - digit * q; // same value as tripled % q
        if (p == first)
            return true;
    }
}

// Returns 2 when gcd(a, b) == 1 and 0 otherwise.
// The gcd is computed with the Euclidean algorithm; the final comparison is turned
// into the result with a shift, so there is no branch on it.
int coprime(unsigned long a, unsigned long b){
    for(; a != 0; ){
        const unsigned long remainder = b % a;
        b = a;
        a = remainder;
    }
    return (int)(b == 1) << 1;
}

// Computes the value stored for one denominator q: for every p in [1, q/3] that is
// not a multiple of 3 and passes testCantor(p, q), adds coprime(i, q) for
// i = p, 3p, 9p, ... while i <= q/3.
// The sum is kept in a private variable so the inner loop does not perform a
// read-modify-write on global memory for every term.
unsigned long countForDenominator(unsigned long q){
    const unsigned long limit = q/3;
    unsigned long total = 0;
    for(unsigned long p = 1; p <= limit; p++){
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= limit; i *= 3)
                total += coprime(i, q);
    }
    return total;
}

// Each work-item processes two elements: its own index and the mirrored index
// counted from the end of the array, pairing a small q with a large q.
// The buffers must hold 2 * get_global_size(0) elements; with the host's global
// size of 16384 the mirrored index is 32767 - gid, as it was when hard-coded.
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output){
    const size_t gid = get_global_id(0);
    const size_t mirrored = 2 * get_global_size(0) - 1 - gid;
    output[gid] = countForDenominator(input[gid]);
    output[mirrored] = countForDenominator(input[mirrored]);
}

我有更好的方法来实现这个吗？我是OpenCL的新手（我在不到24小时之前开始使用它）所以我可能会犯一些相当明显的错误。

编辑：我发现我只生成2个线程。我已将其更改为生成32个线程，每个256 q。它现在从13到14运行时崩溃，但我不明白为什么。它不会从10到11崩溃

EDIT2：我实现了大部分建议（但没弄清楚如何去掉 if (coprime(p, q))），现在它运行得快了一些（n = 10 时耗时不到一秒）。我还能做些什么来加快速度吗？在同一任务上，它的运行速度仅比我的处理器快 33%。

EDIT3：设法用bit twiddling来实现它。不确定是否还有其他条件我可以这样做。仍然没有看到非常大的性能提升（任何建议？）

Answer 1

int execute(unsigned long long* inputList, unsigned long long* outputList) {
    try
    {
       ...
    }
    catch(cl::Error e)
    {
       ...
    }
    return SUCCESS;

上面的 execute() 正在创建缓冲区。如果您多次调用 execute()，就会产生缓冲区创建/垃圾回收的开销。此外，您的全局范围仅为本地范围的两倍，这意味着只会用到 GPU 的两个计算单元。如果您的显卡有 20 个计算单元，则全局范围应至少为 40 × 本地范围。只有 512 个元素不足以让 GPU 保持忙碌，至少要让一半的核心有事可做。另外，for（p = 1; p <= q / 3; p ++）循环的迭代次数对各个核心并不相同：有的核心只循环 10 次，而另一个核心要循环 100 次，这会破坏核心之间的同步执行。您应该编写一个负载更均衡的内核。例如：

让第一个核心计算第一个和最后一个元素，第二个核心计算第二个和第 N-1 个元素，依此类推。这样所有核心的工作量几乎相等，而不会有核心空闲着等待后面的核心。

// Load-balanced variant of the kernel: every work-item handles its own element
// and a partner element chosen by findOtherIndex().
// NOTE(review): findOtherIndex is not defined in this snippet; it is a placeholder
// for the index mapping (for example N - 1 - gid) and must be supplied.
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output)
{
    // First element of this work-item (expected to be the lighter workload).
    int gid = get_global_id(0);
    unsigned long q = input[gid], p = 1;
    output[gid] = 0;
    for(p = 1; p <= q/3; p++) // e.g. counts to 10
    {
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                if(coprime(i,q))
                    output[gid] += 2;
    }

    // Partner element of this work-item (expected to be the heavier workload).
    int N_gid = findOtherIndex(get_global_id(0));
    unsigned long N_q = input[N_gid], N_p = 1;
    output[N_gid] = 0;
    for(N_p = 1; N_p <= N_q/3; N_p++) // e.g. counts to 100
    {
        if(N_p % 3 != 0 && testCantor(N_p, N_q))
            // Iterate over N_p, 3*N_p, ... bounded by N_q/3. The loop previously used
            // p and q from the first half, so it walked the wrong values.
            for(unsigned long i = N_p; i <= N_q/3; i *= 3)
                if(coprime(i,N_q))
                    output[N_gid] += 2;
    }

    // This way, adjacent work-items get "closer to equal" amounts of work.

}

所以，如果你有 4096 个元素，第一个核心将计算第 1 个和第 4096 个元素，第二个核心将计算第 2 个和第 4095 个元素，依此类推。作为起点，本地范围 64、全局范围 4096 应该就可以了。如果你使用了太多 “if”，那么应该为每个 “if” 添加一个做虚拟工作的 “else”，以保持各核心之间的计算步调一致。或者你也可以去掉一些 “if”，只要它们是如下这种简单形式：

 if(a>b)c+=d;

可以改写为

 c+=d*bitTwiddle_and_absoluteValue(a,b); // does only computation, not branching is good for gpu.
 implement bitTwiddle_and_absoluteValue(a,b) such that it returns zero when a<=b and 1 when a>b

编辑：

 giving global size a multiple of number of cores of GPU could give an extra performance.

编辑：让我们优化

 for(p = 1; p <= q/3; p++){
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

p % 3 != 0 意味着余数只能是 1 或 2。

p = 1,4,7,10满足

p % 3 == 1，... => 我们的第一个循环

p = 2,5,8,11满足

p % 3 == 2，... => 我们的第二个循环

让我们连接这些：

 for(p = 1; p <= q/3; p+=3){ // p%3==1 is satisfied
        if(testCantor(p, q)) // so no need for testing modulus
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

 for(p = 2; p <= q/3; p+=3){ // p%3==2 is satisfied
        if(testCantor(p, q)) // so no need for testing modulus 
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

//so we got rid of this part:
for(p = 0; p <= q/3; p+=3){      // p%3==0 is not engaging "if" so we dont need
            if(testCantor(p, q))                     // this loop anymore lol :D
                for(unsigned long i = p; i <= q/3; i *= 3)
                        output[gid] +=  coprime(i,q);
        }

作为额外的好处，总的循环迭代次数减少了 1/3，这应该会让性能略有提升。

编辑：while 循环使用了整数取模运算，没有利用 GPU 的浮点运算能力。

//here convert integers to floats a,b,c
while (a != 0){ // this will need a tolerance range, exact zero is nearly impossible
        c = a;
        a = b % a; //emulate this using fp
         // example: 5%3 --> 5.0 / 3.0 gives 1.yyy so we have 1 at least
         // then we subtract like: 5.0 - floor(5.0/3.0)*3.0
         // we have 2.0 which is 5%3
         // this is just a single condition
         // looks like b%a can be b-floor(b/a)*a but Im not sure
         // good luck!
        b = c;
    }
// here convert floats back to integers again

How can one emulate modulus with using only fp arithmetics without losing precision?

在OpenCL中实现

1 个答案: