我一直在编写一个程序,用来统计康托尔集(Cantor set)中具有给定分母的有理数。我发现在我的电脑上计算分母在 3^14 到 3^15 之间的情况需要 20 个小时甚至更久。由于这个问题需要测试大量相互独立的值,我认为很适合用 OpenCL 在显卡上实现。然而当我尝试实现之后,性能却比我的 CPU 实现慢了几个数量级。下面是我尝试的代码。
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <functional>
#include <ctime>
#include <iostream>
#include <fstream>
#include <exception>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <string>
#include <algorithm>
#include <thread>
#include <cmath>
#include <sstream>
// Status codes returned by init() and execute().
#define SUCCESS 0
#define FAILURE 1
// NOTE(review): not referenced anywhere in the visible source.
#define EXPECTED_FAILURE 2
// Number of entries in every host batch array and in both device buffers.
// execute() launches NUM_ELEMENTS / 2 work-items per batch.
const int NUM_ELEMENTS = 32768;
/**
 * Writes one "label,value" line to std::cout for every non-zero entry of a batch.
 *
 * @param start  label printed for values[0]; entry i is labelled start + i.
 * @param values array of NUM_ELEMENTS entries; entries equal to 0 are skipped.
 */
void printOutput(unsigned long long start, unsigned long long *values){
    for (int i = 0; i < NUM_ELEMENTS; ++i) {
        if (values[i] != 0)
            std::cout << start + i << ',' << values[i] << '\n';
    }
    // One flush per batch instead of one per line (std::endl flushed every time).
    std::cout.flush();
}
/**
 * Fills a batch array with consecutive values.
 *
 * @param start    value stored in dataList[0].
 * @param dataList array of NUM_ELEMENTS entries; entry i receives start + i.
 */
void newList(unsigned long long start, unsigned long long *dataList){
    unsigned long long nextValue = start;
    unsigned long long *const pastEnd = dataList + NUM_ELEMENTS;
    for (unsigned long long *slot = dataList; slot != pastEnd; ++slot) {
        *slot = nextValue;
        ++nextValue;
    }
}
// Bring the OpenCL C++ wrapper types (Kernel, Context, ...) into scope.
using namespace cl;
// OpenCL state shared by the functions below: init() assigns all of these;
// execute() then uses kernelA, queue and the two buffers for every batch.
Kernel kernelA;        // the "countRationals" kernel built from Rationals.cl
Context context;       // context created for a GPU, or a CPU as fallback
CommandQueue queue;    // queue on the first device of `context`
Buffer inputBuffer;    // device buffer of NUM_ELEMENTS unsigned long long, written before each launch
Buffer outputBuffer;   // device buffer of NUM_ELEMENTS unsigned long long, read back after each launch
int init() {
cl_int status = 0;
const char* buildOption ="-x clc++ ";
std::vector<Platform> platforms;
status = Platform::get(&platforms);
if (status != CL_SUCCESS){
std::cout<<"Error: Getting platforms!"<<std::endl;
return FAILURE;
}
std::vector<cl::Platform>::iterator iter;
for(iter = platforms.begin(); iter != platforms.end(); ++iter)
if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
break;
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter) (), 0};
bool gpuNotFound = false;
try{
context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status);
}
catch(std::exception e){
gpuNotFound = true;
}
if(gpuNotFound){
std::cout<<"GPU not found, falling back to CPU!"<<std::endl;
context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status);
if (status != CL_SUCCESS){
std::cout<<"Error: Creating context!"<<std::endl;
return FAILURE;
}
}
Program program;
try{
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
queue = CommandQueue(context, devices[0]);
std::ifstream sourceFile("Rationals.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
program = Program(context, source);
program.build(devices, buildOption);
kernelA = Kernel(program, "countRationals");
inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
}catch(cl::Error e){
std::cout << e.what() << std::endl;
std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(cl::Device::getDefault()) << std::endl;
std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(cl::Device::getDefault()) << std::endl;
std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault()) << std::endl;
return FAILURE;
}
return SUCCESS;
}
int execute(unsigned long long* inputList, unsigned long long* outputList) {
try{
queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList);
kernelA.setArg(0, inputBuffer);
kernelA.setArg(1, outputBuffer);
NDRange global(NUM_ELEMENTS/2);
NDRange local(256);
queue.enqueueNDRangeKernel(kernelA, NullRange, global, local);
queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList);
}catch(cl::Error e){
std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
return FAILURE;
}
return SUCCESS;
}
using namespace std;
int main(int argc, char* argv[]){
unsigned long long minNum, maxNum;
if (argc == 2){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[1]) + 1);
}
else if (argc == 3){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else if (argc == 4){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else return -1;
unsigned long long *q = nullptr, *result = nullptr, *old = nullptr, *newq = nullptr;
thread workThread, outThread, genThread;
q = new unsigned long long[NUM_ELEMENTS];
newList(minNum, q);
result = new unsigned long long[NUM_ELEMENTS];
newq = new unsigned long long[NUM_ELEMENTS];
init();
genThread = thread(newList, minNum+NUM_ELEMENTS, newq);
workThread = thread(execute, q, result);
workThread.join();
genThread.join();
for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum + NUM_ELEMENTS; i += NUM_ELEMENTS){
old = result;
q = newq;
result = new unsigned long long[NUM_ELEMENTS];
newq = new unsigned long long[NUM_ELEMENTS];
genThread = thread(newList, i+NUM_ELEMENTS, newq);
workThread = thread(execute, q, result);
outThread = thread(printOutput, i-NUM_ELEMENTS, old);
workThread.join();
outThread.join();
genThread.join();
delete[] old;
delete[] q;
q = old = nullptr;
}
delete[] newq;
delete[] result;
return 0;
}
内核代码
/*
 * Checks the base-3 digits of p/q and reports whether the digit 1 can be avoided.
 *
 * Phase 1 removes the factors of 3 from q; each removal exposes one leading
 * digit (p / q after the division). A digit of 1 is accepted only when nothing
 * follows it (p == q).
 * Phase 2 walks the repeating digits by multiplying the remainder by 3 until
 * it returns to its starting value; any digit equal to 1 rejects.
 *
 * Returns true when no rejecting digit was found.
 * NOTE(review): assumes q != 0 and p < q (the kernel passes p <= q/3); with
 * p >= q the second loop may never return to its start value. Confirm callers.
 */
bool testCantor(unsigned long p, unsigned long q){
    // Phase 1: leading digits produced by the powers of 3 in the denominator.
    while (q % 3 == 0) {
        q /= 3;
        const unsigned long digit = p / q;
        if (digit == 1)
            return p == q;
        p -= digit * q;  // remainder of p modulo q
    }

    // Phase 2: repeating digits; q no longer has a factor of 3 here.
    const unsigned long firstRemainder = p;
    for (;;) {
        const unsigned long scaled = 3 * p;
        if (scaled / q == 1)
            return false;
        p = scaled % q;
        if (p == firstRemainder)
            return true;
    }
}
/*
 * Returns 2 when gcd(a, b) == 1 and 0 otherwise.
 * The result is the amount the caller adds to its count, not a boolean.
 * The gcd is computed with the Euclidean remainder loop, and the final
 * comparison is turned into 0/2 without a branch.
 */
int coprime(unsigned long a, unsigned long b){
    while (a != 0) {
        const unsigned long remainder = b % a;
        b = a;
        a = remainder;
    }
    // b now holds the gcd; (b == 1) is 0 or 1, shifted to 0 or 2.
    return (b == 1) << 1;
}
/*
 * Computes the count for one denominator q.
 *
 * For every p in [1, q/3] that is not a multiple of 3 and passes
 * testCantor(p, q), each i = p, 3p, 9p, ... not exceeding q/3 contributes
 * coprime(i, q), which is 2 when gcd(i, q) == 1 and 0 otherwise.
 * The sum is kept in a private variable so the caller writes global memory
 * only once.
 */
unsigned long countForDenominator(unsigned long q){
    const unsigned long limit = q / 3;
    unsigned long total = 0;
    for (unsigned long p = 1; p <= limit; p++) {
        if (p % 3 != 0 && testCantor(p, q))
            for (unsigned long i = p; i <= limit; i *= 3)
                total += coprime(i, q);
    }
    return total;
}

/*
 * Each work-item processes two entries: its own index and the mirrored index
 * counted from the end. This pairs a small denominator with a large one so
 * work-items do similar amounts of work.
 *
 * input  : denominators, 2 * get_global_size(0) entries.
 * output : one count per denominator, same layout as input.
 *
 * The mirrored index is derived from the launch size. With the host's launch
 * of 16384 work-items it equals 32767 - gid, as before.
 */
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output){
    const size_t gid = get_global_id(0);
    const size_t mirror = 2 * get_global_size(0) - 1 - gid;
    output[gid] = countForDenominator(input[gid]);
    output[mirror] = countForDenominator(input[mirror]);
}
我有更好的方法来实现这个吗?我是OpenCL的新手(我在不到24小时之前开始使用它)所以我可能会犯一些相当明显的错误。
编辑:我发现我只生成了 2 个线程。我已将其改为生成 32 个线程,每个 256 个 q。现在从 13 到 14 运行时会崩溃,但我不明白为什么;从 10 到 11 运行时不会崩溃。
编辑 2:我实现了大部分建议(但没弄清楚如何去掉 if(coprime(p,q))),现在运行得快了一些(n = 10 时不到一秒)。我还能做些什么来加快速度吗?它在同一任务上的运行速度仅比我的处理器快 33%。
EDIT3:设法用bit twiddling来实现它。不确定是否还有其他条件我可以这样做。仍然没有看到非常大的性能提升(任何建议?)
答案 0 :(得分:1)
int execute(unsigned long long* inputList, unsigned long long* outputList) {
try
{
...
}
catch(cl::Error e)
{
...
}
return SUCCESS;
这个函数里正在创建缓冲区。如果您多次调用 execute(),每次都会产生缓冲区创建/回收的开销。此外,您的全局范围(global range)仅为本地范围(local range)的两倍,这意味着只会用到 GPU 的两个计算单元。如果您的显卡有 20 个计算单元,全局范围至少应为本地范围的 40 倍。只有 512 个元素不足以让 GPU(至少其中一半的核心)保持忙碌。另外,for(p = 1; p <= q/3; p++) 循环的迭代次数对各个核心并不相同:有的核心只计数到 10,而另一个核心要计数到 100,这会打乱核心之间的执行步调。您应该编写一个负载更均衡的内核。例如:
让第一个核心计算第一个和最后一个元素,第二个核心计算第二个和第 N-1 个元素,……这样所有核心的工作量几乎相等,而不必空闲等待后面的核心。
/*
 * Load-balanced variant: each work-item processes its own element and a
 * second element chosen by findOtherIndex(), so adjacent work-items end up
 * with closer-to-equal work.
 *
 * NOTE(review): findOtherIndex() is not defined in the visible source;
 * presumably it maps gid to the mirrored index (N - 1 - gid). Confirm.
 *
 * Fix: the second inner loop now iterates with N_p / N_q. It previously
 * reused p and q from the first half, where p had already passed q/3, so it
 * never ran and output[N_gid] stayed 0.
 */
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output)
{
    // computing first element (least workload among the array)
    int gid = get_global_id(0);
    unsigned long q = input[gid], p = 1;
    output[gid] = 0;
    for(p = 1; p <= q/3; p++) // short loop, e.g. counts to 10
    {
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                if(coprime(i,q))
                    output[gid] += 2;
    }
    // computing the paired element (heaviest workload among the array)
    int N_gid = findOtherIndex(get_global_id(0));
    unsigned long N_q = input[N_gid], N_p = 1;
    output[N_gid] = 0;
    for(N_p = 1; N_p <= N_q/3; N_p++) // long loop, e.g. counts to 100
    {
        if(N_p % 3 != 0 && testCantor(N_p, N_q))
            for(unsigned long i = N_p; i <= N_q/3; i *= 3)
                if(coprime(i,N_q))
                    output[N_gid] += 2;
    }
    // this way, adjacent work-items have "closer to equal" work.
}
所以,如果你有4096个元素,第一个核心将计算第一个和第4096个元素,第二个核心将计算第二个和第4095个元素,......本地范围64和全局范围4096应该可以开始。如果你使用太多“if”,那么你应该为它们中的每一个添加一个“else”来做虚拟工作以保持核心之间的计算顺序。或者你可以删除一些“if”,只要它们如下:
if(a>b)c+=d;
可以被截取为
c+=d*bitTwiddle_and_absoluteValue(a,b); // does only computation, not branching is good for gpu.
implement bitTwiddle_and_absoluteValue(a,b) such that it returns zero when a<=b and 1 when a>b
编辑:
Making the global size a multiple of the number of GPU cores could give some extra performance.
编辑:让我们优化
// Loop to be optimized: visits every p in [1, q/3] and uses the modulus test
// to skip multiples of 3.
for(p = 1; p <= q/3; p++){
if(p % 3 != 0 && testCantor(p, q))
// for each accepted p, also visit 3p, 9p, ... while still <= q/3
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
p % 3 != 0 表示余数只能是 1 或 2。
p = 1, 4, 7, 10, ... 满足 p % 3 == 1 => 我们的第一个循环
p = 2, 5, 8, 11, ... 满足 p % 3 == 2 => 我们的第二个循环
让我们连接这些:
// First stride-3 loop: p = 1, 4, 7, ... so p % 3 == 1 always holds.
for(p = 1; p <= q/3; p+=3){ // p % 3 == 1 is satisfied
if(testCantor(p, q)) // so the modulus test is no longer needed
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
// Second stride-3 loop: p = 2, 5, 8, ... so p % 3 == 2 always holds.
for(p = 2; p <= q/3; p+=3){ // p % 3 == 2 is satisfied
if(testCantor(p, q)) // so the modulus test is no longer needed
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
// The iterations that were dropped (multiples of 3), shown for illustration only.
// NOTE(review): not meant to be run as written. If the inner loop is entered
// with p = 0, i stays 0 under `i *= 3` and the loop never terminates.
for(p = 0; p <= q/3; p+=3){ // p % 3 == 0 never passed the original "if",
if(testCantor(p, q)) // so this loop is not needed anymore
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
作为额外的好处,总的循环迭代次数减少了 1/3,这应该会让性能略有提升。
编辑:while循环具有模数,不使用GPU的浮点潜力。
// Sketch: convert the integers a, b, c to floating point before this loop.
while (a != 0){ // with fp values this test needs a tolerance; an exact zero is unlikely
c = a;
a = b % a; // emulate this remainder with floating-point operations
// Example: 5 % 3 -> 5.0 / 3.0 gives 1.66..., so the quotient is at least 1;
// then subtract: 5.0 - floor(5.0 / 3.0) * 3.0
// which leaves 2.0, i.e. 5 % 3.
// This is just a single example.
// So b % a can presumably be written as b - floor(b / a) * a (the author was not sure).
// NOTE(review): confirm the operands are exactly representable in the chosen
// fp type; otherwise the result can differ from integer %.
b = c;
}
// Convert the floating-point values back to integers here.
How can one emulate modulus using only floating-point arithmetic without losing precision?