我正在研究高斯模糊,为了加速算法执行,我正在考虑将图像分成几部分并为每个部分应用线程。 不幸的是,算法恰恰相反,它只是减慢了速度。由于我是线程新手,我想知道这是我做错了还是整个概念都错了。
关于我的高斯算法的一些信息:由于高斯核左右两侧是对称的,我使用了半核技巧(只存储核的一半)。主要方法是 RunGaussian,但线程是在 GaussianWithThreading 方法中创建和执行的。
重要说明:我受限于 Mono 框架,无法使用 Task(任务并行库)。这不是最终版本,只是为了测试多线程执行效果而准备的快速原型。
算法概念:
1)拍照,转换为灰度获取一系列值(此处未包含)。
2)根据使用的线程数将数组划分为多个段。
3)每个线程对输入数组中分配给它的那一段执行 x 方向的计算,并更新全局数组 x_values 中对应的值。
4)第 3)步完成后,每个线程对全局数组 x_values 中分配给它的那一段执行 y 方向的计算。
5)完成4)后,将新的灰度值转换为纹理(此处不包括)。
/// <summary>
/// Two-pass (horizontal, then vertical) blur over a flat float array that is
/// indexed row by row (x = i % width, y = i / width). The work of each pass is
/// split into contiguous index ranges, one per thread.
/// </summary>
/// <remarks>
/// Kernels are "half kernels": element 0 is the centre weight and element j is
/// applied to both neighbours at distance j. Neighbour coordinates that fall
/// outside the image are clamped to the nearest edge.
/// </remarks>
public class Gaussian
{
    private float[] colors;
    private float[] x_values;
    private float[] y_values;

    // Image dimensions are fixed in this prototype.
    private int width = 2000;
    private int height = 1900;
    private int thread_pool;

    /// <summary>
    /// Runs both passes with the same kernel and returns the result buffer.
    /// </summary>
    /// <param name="number_of_threads">Number of worker threads per pass; must be at least 1.</param>
    /// <param name="colors">
    /// Input values. Results are written into buffers of width * height elements,
    /// so this array must not be longer than that.
    /// </param>
    /// <param name="kernel">Half kernel used for both passes.</param>
    /// <returns>A new array of width * height elements holding the filtered values.</returns>
    /// <exception cref="System.ArgumentNullException">colors or kernel is null.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException">number_of_threads is less than 1.</exception>
    public float[] RunGaussian(int number_of_threads, float[] colors, float[] kernel)
    {
        if (colors == null)
        {
            throw new System.ArgumentNullException("colors");
        }
        if (kernel == null)
        {
            throw new System.ArgumentNullException("kernel");
        }
        if (number_of_threads < 1)
        {
            throw new System.ArgumentOutOfRangeException("number_of_threads", "At least one thread is required.");
        }

        this.colors = colors;
        x_values = new float[width * height];
        y_values = new float[width * height];
        thread_pool = number_of_threads;
        return GaussianWithThreading(kernel, kernel);
    }

    /// <summary>
    /// Runs the horizontal pass (colors -> x_values) to completion, then the
    /// vertical pass (x_values -> y_values).
    /// </summary>
    private float[] GaussianWithThreading(float[] kernel_x, float[] kernel_y)
    {
        RunPass(0, kernel_x, kernel_y, colors);
        // The vertical pass reads x_values, so it may only start once every
        // horizontal worker has finished; RunPass blocks until then.
        RunPass(1, kernel_x, kernel_y, x_values);
        return y_values;
    }

    /// <summary>
    /// Splits the index range of the image into one contiguous segment per
    /// thread, runs the requested pass on all segments and waits for completion.
    /// </summary>
    private void RunPass(int dimension, float[] kernel_x, float[] kernel_y, float[] source)
    {
        Thread[] threads = new Thread[thread_pool];
        int per_thread = colors.Length / thread_pool;

        for (int i = 0; i < thread_pool; i++)
        {
            // Declared inside the loop so that every lambda captures its own copy.
            int start = per_thread * i;
            // The last segment absorbs the remainder of the integer division;
            // otherwise the trailing elements would never be processed.
            int stop = (i == thread_pool - 1) ? colors.Length - 1 : (per_thread * (i + 1)) - 1;
            threads[i] = new Thread(() => ApplyWithThread(dimension, kernel_x, kernel_y, source, start, stop));
            threads[i].Start();
        }

        // Join blocks without consuming CPU, unlike polling IsAlive in a loop,
        // which competes with the workers for a core.
        foreach (Thread worker in threads)
        {
            worker.Join();
        }
    }

    /// <summary>
    /// Filters the elements start_point..finish_point (inclusive) of
    /// <paramref name="values"/>. Dimension 0 filters along x and writes to
    /// x_values; any other value filters along y and writes to y_values.
    /// </summary>
    private void ApplyWithThread(int dimension, float[] kernel_x, float[] kernel_y, float[] values, int start_point, int finish_point)
    {
        if (dimension == 0)
        {
            // x-dimension
            for (int i = start_point; i <= finish_point; i++)
            {
                int x = i % width;
                int y = i / width;
                int y_cor = y * width;
                float val = values[i] * kernel_x[0];
                for (int j = 1, n = kernel_x.Length; j < n; j++)
                {
                    // Clamp neighbours to the current row.
                    int previous = x - j;
                    if (previous < 0)
                    {
                        previous = 0;
                    }
                    int next = x + j;
                    if (next >= width)
                    {
                        next = width - 1;
                    }
                    int p_pos = y_cor + previous;
                    int n_pos = y_cor + next;
                    val += (values[p_pos] * kernel_x[j]) + (values[n_pos] * kernel_x[j]);
                }
                x_values[i] = val;
            }
        }
        else
        {
            // y-dimension
            for (int i = start_point; i <= finish_point; i++)
            {
                int x = i % width;
                int y = i / width;
                float val = values[i] * kernel_y[0];
                for (int j = 1, n = kernel_y.Length; j < n; j++)
                {
                    // Clamp neighbours to the current column.
                    int previous = y - j;
                    if (previous < 0)
                    {
                        previous = 0;
                    }
                    int next = y + j;
                    if (next >= height)
                    {
                        next = height - 1;
                    }
                    int p_pos = previous * width + x;
                    int n_pos = next * width + x;
                    val += (values[p_pos] * kernel_y[j]) + (values[n_pos] * kernel_y[j]);
                }
                y_values[i] = val;
            }
        }
    }
}
好的,我现在有了一些进展。我已经改用 ThreadPool、ManualResetEvent 和 WaitHandle,以确保所有线程在需要的时刻汇合。我还按照 Harold 的建议修改了内核应用算法,使其使用转置,不过我没有把边界检查拆分出来(我想到的每种方案都会让它比当前版本更复杂)。还有一些工作要做,因为 WaitHandle 最多只允许 64 个事件,迫使我使用不超过 64 个线程。此外,当我使用 63 个线程时,我注意到我的计算机变得迟钝(出现卡顿尖峰,可能是由于 WaitHandle),这让我觉得 WaitHandle 不是最好的选择。遗憾的是,我使用的是 .NET 4.0 之前的版本,不能使用 Barrier 类。请参阅下面的代码段。G_Data 只是一个结构体,用来让代码看起来更整洁,我可能会去掉它(这应该能带来一些性能提升)。
测试数据(仅统计滤波器应用本身的耗时,不含纹理与数据之间的相互转换,也不含内核准备):
纹理 - 2048x1421
从结果可以很容易地看出,对于小内核,线程数量带来的差异微不足道;但随着内核尺寸增大,线程数量开始起到重要作用(这并不奇怪)。
/// <summary>
/// Applies a two-pass filter using thread-pool workers. The first pass filters
/// the rows of <paramref name="values"/> into x_values; the second pass filters
/// x_values into y_values with width and height swapped, because ApplyKernel
/// writes its output transposed.
/// </summary>
/// <param name="thread_count">Maximum number of work items queued per pass; must be at least 1.</param>
/// <param name="kernel_x">Half kernel for the first pass.</param>
/// <param name="kernel_y">Half kernel for the second pass.</param>
/// <param name="values">Input values, width * height elements.</param>
/// <param name="width">Row length of the input.</param>
/// <param name="height">Number of rows of the input.</param>
/// <returns>The buffer written by the second pass (width * height elements).</returns>
/// <exception cref="System.ArgumentOutOfRangeException">thread_count is less than 1.</exception>
public float[] ApplyFilter(int thread_count, float[] kernel_x, float[] kernel_y, float[] values, int width, int height)
{
    if (thread_count < 1)
    {
        throw new System.ArgumentOutOfRangeException("thread_count", "At least one worker is required.");
    }

    this.thread_count = thread_count;
    this.kernel_x = kernel_x;
    this.kernel_y = kernel_y;
    this.values = values;
    this.width = width;
    this.height = height;
    x_values = new float[width * height];
    y_values = new float[width * height];

    // x-pass: 'height' rows of 'width' elements each.
    RunKernelPass(thread_count, height, width, values, x_values, kernel_x);
    // y-pass: the intermediate buffer is transposed, so it has 'width' rows
    // of 'height' elements each.
    RunKernelPass(thread_count, width, height, x_values, y_values, kernel_y);
    return y_values;
}

/// <summary>
/// Splits <paramref name="row_count"/> rows into contiguous bands, queues one
/// ApplyKernel work item per band and blocks until all of them have finished.
/// </summary>
/// <remarks>
/// A single event plus an interlocked countdown is used instead of one event
/// per worker, so the 64-handle limit of WaitHandle.WaitAll does not apply and
/// only one handle has to be released.
/// </remarks>
private void RunKernelPass(int worker_count, int row_count, int row_width, float[] input, float[] output, float[] kernel)
{
    if (row_count < 1)
    {
        return;
    }

    // Never queue more workers than there are rows; surplus workers would
    // only repeat work on the first row.
    int workers = (worker_count < row_count) ? worker_count : row_count;
    int rows_per_worker = row_count / workers;
    int pending = workers;

    using (ManualResetEvent all_done = new ManualResetEvent(false))
    {
        for (int i = 0; i < workers; i++)
        {
            int start = rows_per_worker * i;
            // The last band absorbs the remainder of the integer division.
            int stop = (i == workers - 1) ? row_count - 1 : (rows_per_worker * (i + 1)) - 1;
            G_Data data = new G_Data(start, stop, row_width, row_count, input, output, kernel);
            ThreadPool.QueueUserWorkItem(state =>
            {
                try
                {
                    ApplyKernel(data);
                }
                finally
                {
                    // Count down even when the kernel throws, so the caller
                    // is not left waiting on a signal that never arrives.
                    if (Interlocked.Decrement(ref pending) == 0)
                    {
                        all_done.Set();
                    }
                }
            });
        }

        all_done.WaitOne();
    }
}
如果您惊讶地看到宽度/高度被交换为y-pass,那是因为数组转置操作。
内核算法:
/// <summary>
/// Filters the rows start..stop (inclusive) described by a G_Data work item.
/// Element 0 of the kernel weights the centre sample and element k weights both
/// neighbours at distance k; neighbour columns outside the row are clamped to
/// the row's first/last column. Results are written transposed
/// (column * height + row).
/// </summary>
/// <param name="args">A boxed G_Data instance describing the work item.</param>
public void ApplyKernel(object args)
{
    G_Data job = (G_Data)args;
    int first_line = job.start;
    int last_line = job.stop;
    float[] weights = job.kernel;
    float[] source = job.input_data;
    float[] target = job.output_data;
    int line_length = job.width;
    int line_count = job.height;

    int line = first_line;
    while (line <= last_line)
    {
        int line_base = line * line_length;
        int column = 0;
        while (column < line_length)
        {
            float sum = source[line_base + column] * weights[0];
            int taps = weights.Length;
            for (int tap = 1; tap < taps; tap++)
            {
                // Clamp the mirrored neighbour pair to the current line.
                int left = column - tap;
                if (left < 0)
                {
                    left = 0;
                }
                int right = column + tap;
                if (right >= line_length)
                {
                    right = line_length - 1;
                }
                sum += (source[line_base + left] * weights[tap]) + (source[line_base + right] * weights[tap]);
            }
            // Transposed store: the next pass can again walk along rows.
            target[column * line_count + line] = sum;
            column++;
        }
        line++;
    }
}
内核算法版本2
/// <summary>
/// Filters the rows start..stop (inclusive) described by a G_Data work item and
/// writes the result transposed (column * height + row). Each row is split into
/// a left band, a right band and a body so that the body loop needs no bounds
/// clamping.
/// </summary>
/// <remarks>
/// When the kernel is too long for the bands to fit side by side in a row
/// (kernel.Length >= width / 2), every column is processed with clamping on
/// both sides instead. The unclamped accesses of the band loops would otherwise
/// read neighbouring rows or run past the end of the buffers.
/// </remarks>
/// <param name="args">A boxed G_Data instance describing the work item.</param>
public void ApplyKernel(object args)
{
    G_Data input_data = (G_Data)args;
    int start = input_data.start;
    int stop = input_data.stop;
    float[] kernel = input_data.kernel;
    float[] input = input_data.input_data;
    float[] output = input_data.output_data;
    int input_width = input_data.width;
    int input_height = input_data.height;
    int offset = kernel.Length;
    bool is_kernel_oversized = offset >= input_width / 2;
    int y, x;

    for (y = start; y <= stop; y++)
    {
        int row = y * input_width;

        if (is_kernel_oversized)
        {
            // Bands would overlap or exceed the row: clamp both neighbours
            // for every column.
            for (x = 0; x < input_width; x++)
            {
                float val = input[row + x] * kernel[0];
                for (int k = 1, n = kernel.Length; k < n; k++)
                {
                    int p_x = ((x - k) < 0) ? 0 : x - k;
                    int n_x = ((x + k) >= input_width) ? input_width - 1 : x + k;
                    val += (input[row + p_x] * kernel[k]) + (input[row + n_x] * kernel[k]);
                }
                output[x * input_height + y] = val;
            }
            continue;
        }

        // Left edge: only the left neighbour can leave the row.
        for (x = 0; x < offset; x++)
        {
            float val = input[row + x] * kernel[0];
            for (int k = 1, n = kernel.Length; k < n; k++)
            {
                int p_x = ((x - k) < 0) ? 0 : x - k;
                val += (input[row + p_x] * kernel[k]) + (input[row + x + k] * kernel[k]);
            }
            output[x * input_height + y] = val;
        }

        // Right edge: only the right neighbour can leave the row.
        for (x = input_width - offset; x < input_width; x++)
        {
            float val = input[row + x] * kernel[0];
            for (int k = 1, n = kernel.Length; k < n; k++)
            {
                int n_x = ((x + k) >= input_width) ? input_width - 1 : x + k;
                val += (input[row + x - k] * kernel[k]) + (input[row + n_x] * kernel[k]);
            }
            output[x * input_height + y] = val;
        }

        // Body: both neighbours are guaranteed to stay inside the row.
        for (x = offset; x < input_width - offset; x++)
        {
            float val = input[row + x] * kernel[0];
            for (int k = 1, n = kernel.Length; k < n; k++)
            {
                val += (input[row + x - k] * kernel[k]) + (input[row + x + k] * kernel[k]);
            }
            output[x * input_height + y] = val;
        }
    }
}