我是使用OpenCL(使用OpenCL.NET库)和Visual Studio C#的新手,目前正在开发一个计算大型3D矩阵的应用程序。在矩阵中的每个像素处,计算192个唯一值,然后求和以产生该像素的最终值。所以,在功能上,它就像一个4-D矩阵,(161 x 161 x 161)x 192.
现在我正在从我的主机代码中调用内核:
//C# host code
...
// Flattened result buffer: one float per pixel of the 161 x 161 x 161 matrix.
float[] BigMatrix = new float[161*161*161]; //1-D result array
// Wrap the host arrays in device-side variables.
CLCalc.Program.Variable dev_BigMatrix = new CLCalc.Program.Variable(BigMatrix);
CLCalc.Program.Variable dev_OtherArray = new CLCalc.Program.Variable(otherArray);
//...load some other variables here too.
// The placeholder is a block comment so that the closing brace and the
// terminating semicolon are not swallowed by a line comment.
CLCalc.Program.Variable[] args = new CLCalc.Program.Variable[7] { /* stuff... */ };
//Here, I execute the kernel, with a 2-dimensional worker pool:
// NOTE(review): dimension 0 selects the pixel and dimension 1 the otherArray
// entry, so 192 work items target each BigMatrix element - confirm the kernel
// accumulates them safely.
BigMatrixCalc.Execute(args, new int[2]{N*N*N,192});
// Copy the computed matrix back into the host array.
dev_BigMatrix.ReadFromDeviceTo(BigMatrix);
示例内核代码发布在下面。
/*
 * Adds one partial result to a single pixel of the flattened N x N x N matrix.
 *
 * Work-item layout (2-D range):
 *   get_global_id(0) - index of the pixel in the 1-D BigMatrix array
 *   get_global_id(1) - index of the otherArray entry handled by this work item
 *
 * BigMatrix  - flattened result array; the partial result is added in place.
 * otherArray - per-entry input values used by the (elided) calculation.
 */
__kernel void MyKernel(
__global float * BigMatrix,
__global float * otherArray
//various other variables... (put a comma after otherArray when adding them)
)
{
int N = 161; //Size of matrix edges
int pixel_id = get_global_id(0); //The location of the pixel in the 1D array
int array_id = get_global_id(1); //The location within the otherArray
//Finding the x,y,z values of the pixel_id.
float3 p;
p.x = pixel_id % N;
p.y = ((pixel_id % (N*N))-p.x)/N;
p.z = (pixel_id - p.x - p.y*N)/(N*N);
//Start from zero so the accumulation below never reads an indeterminate value.
float result = 0.0f;
//...
//Some long calculation for 'result' involving otherArray and p...
//...
//NOTE(review): every work item that shares pixel_id but differs in array_id
//performs this non-atomic read-modify-write on the same element, and nothing
//here orders those updates. Consider summing over otherArray inside a single
//work item (1-D range) or using a reduction instead.
BigMatrix[pixel_id] += result;
}
我的代码目前可以正常运行,但我希望提高这个应用程序的速度,而且我不确定我的工作项/工作组设置是否是最好的方法(即工作池的两个维度分别为161 * 161 * 161和192)。
我已经看到了将全局工作池组织到本地工作组以提高效率的其他示例,但我不太确定如何在OpenCL.NET中实现它。我也不确定这与仅在工作池中创建另一个维度有什么不同。
所以,我的问题是:我可以在这里使用本地工作组吗?如果可以,我应该如何组织它们?一般来说,使用本地工作组与仅仅调用n维工作池有什么区别? (即调用Execute(args,new int [] {(N * N * N),192}),与把本地工作组大小设为192相比有何不同?)
感谢您的帮助!
答案 0 :(得分:1)
我有几点建议:
希望这有帮助!
答案 1 :(得分:1)
我认为大量的性能损失来自等待内存访问。我已经回答过一个类似的问题(similar SO question),希望那篇帖子可以帮助你。如果您有任何问题,请提出。
优化:
...
// Edge length of the cubic matrix and the sizes derived from it.
// The derived expansions are parenthesized so each macro acts as a single
// operand: without the parentheses `x % Nsqr` expands to `x % N*N`, which is
// evaluated as `(x % N) * N`, and `x / Nsqr` as `(x / N) * N`.
#define N 161
#define Nsqr (N*N)
#define Ncub (N*N*N)
// Number of entries in otherArray that are summed for every pixel.
#define otherSize 192
/*
 * Computes four consecutive BigMatrix entries per work item; each work item
 * sums all otherSize contributions for its pixels itself, so no second
 * range dimension is needed.
 *
 * Launch with a 1-D global size of at least ceil(N^3 / 4). N^3 is not a
 * multiple of 4 for N = 161, and the global size may be rounded up further,
 * so pixels past the end of the matrix are skipped when writing.
 *
 * BigMatrix  - flattened N*N*N array; each computed result is added in place.
 * otherArray - otherSize input values, staged into local memory per work-group.
 */
__kernel void MyKernel(__global float * BigMatrix, __global float * otherArray)
{
    //sizes are spelled out with N only, so they are correct no matter how the
    //Nsqr/Ncub macros are parenthesized
    const int plane_size = N * N;
    const int total_pixels = N * N * N;

    //this work item is responsible for 4 consecutive values in BigMatrix
    const int first_pixel = (int)get_global_id(0) * 4; //location of the first pixel in the 1D array
    const int local_id = (int)get_local_id(0);         //work item id within the group
    const int local_size = (int)get_local_size(0);     //size of group

    float result[4]; //result cached for 4 global values
    float3 p;

    //cache the values in otherArray to local memory: every work item in the
    //group reads all of them, and 192 floats (768 bytes) is far below the
    //minimum local memory size OpenCL requires
    __local float otherValues[otherSize];
    for (int k = local_id; k < otherSize; k += local_size) {
        otherValues[k] = otherArray[k];
    }
    //barrier, not mem_fence: mem_fence only orders this work item's own memory
    //operations, while the loops below read entries written by other work
    //items of the group. Every work item reaches this barrier (no early exit).
    barrier(CLK_LOCAL_MEM_FENCE);

    //now this work item can compute the complete result for each of its pixels
    for (int j = 0; j < 4; j++) {
        const int pixel_id = first_pixel + j;
        result[j] = 0;

        //Finding the x,y,z values of the pixel_id with integer arithmetic.
        //TODO: optimize the calculation of p.y and p.z
        //they will be the same most of the time for a given work item
        p.x = pixel_id % N;
        p.y = (pixel_id / N) % N;
        p.z = pixel_id / plane_size;

        for (int i = 0; i < otherSize; i++) {
            //...
            //Some long calculation for 'result' involving otherValues[i] and p...
            //...
            //result[j] += ...
        }
    }

    //4 consecutive writes to BigMatrix (the original author's rationale: they
    //tend to fall in the same cacheline); out-of-range pixels are skipped
    for (int j = 0; j < 4; j++) {
        const int pixel_id = first_pixel + j;
        if (pixel_id < total_pixels) {
            BigMatrix[pixel_id] += result[j];
        }
    }
}
注意:
为了避免太多昂贵的除法和模运算,还需要对p.x / y / z进行进一步的优化。见下面的代码。
__kernel void MyKernel(__global float * BigMatrix, __global float * otherArray) {
int global_id = get_global_id(0) * 4; //The location of the first pixel in the 1D array
int pixel_id = global_id;
int local_id = get_local_id(0); //work item id within the group
int local_size = get_local_size(0); //size of group
float result[4]; //result cached for 4 global values
int i, j;
float3 p;
//Finding the initial x,y,z values of the pixel_id.
p.x = pixel_id % N;
p.y = ((pixel_id % Nsqr)-p.x)/N;
p.z = (pixel_id - p.x - p.y*N)/Nsqr;
//cache the values here. same as above...
//now this work item can compute the complete result for pixel_id
for(j=0;j<4;j++){
result[j] = 0;
//increment the x,y,and z values instead of computing them all from scratch
p.x += 1;
if(p.x >= N){
p.x = 0;
p.y += 1;
if(p.y >= N){
p.y = 0;
p.z += 1;
}
}
for(i=0;i<otherSize;i++){
//same i loop as above...
}
}