我有一个用 Visual C# 编写的矩阵乘法程序,它会使用 i3 处理器的所有逻辑核心。我想知道如何用 C 语言实现同样的功能,并希望得到相应的解释。程序如下:
using System;
using System.Diagnostics;
using System.Threading.Tasks;
namespace MatrixMultiplication
{
internal class Program
{
#region Sequential_Loop
/// <summary>
/// Computes the matrix product <paramref name="matA"/> x <paramref name="matB"/> on the
/// calling thread and writes it into <paramref name="result"/>.
/// </summary>
/// <param name="matA">Left operand; its column count is the length of every dot product.</param>
/// <param name="matB">Right operand; read as [k, j] for k below matA's column count.</param>
/// <param name="result">
/// Destination matrix. Every cell [i, j] with i below matA's row count and j below matB's
/// column count is overwritten; any previous content of those cells is discarded.
/// </param>
private static void MultiplyMatricesSequential(double[,] matA, double[,] matB,
    double[,] result)
{
    int matACols = matA.GetLength(1);
    int matBCols = matB.GetLength(1);
    int matARows = matA.GetLength(0);
    for (int i = 0; i < matARows; i++)
    {
        for (int j = 0; j < matBCols; j++)
        {
            // Sum into a local and assign once. Accumulating directly into result[i, j]
            // (+=) is only correct when the caller passes a zero-filled matrix; assigning
            // makes this method agree with MultiplyMatricesParallel, which overwrites.
            double sum = 0;
            for (int k = 0; k < matACols; k++)
            {
                sum += matA[i, k] * matB[k, j];
            }
            result[i, j] = sum;
        }
    }
}
#endregion
#region Parallel_Loop
/// <summary>
/// Computes the matrix product <paramref name="matA"/> x <paramref name="matB"/> with
/// Parallel.For, handing out one row of the product per loop iteration, and stores it in
/// <paramref name="result"/>.
/// </summary>
/// <param name="matA">Left operand.</param>
/// <param name="matB">Right operand.</param>
/// <param name="result">Destination; each computed cell is overwritten.</param>
private static void MultiplyMatricesParallel(double[,] matA, double[,] matB, double[,] result)
{
    int innerLength = matA.GetLength(1);
    int productCols = matB.GetLength(1);
    int productRows = matA.GetLength(0);

    // Work item for a single row of the product. Different rows write to different cells
    // of result, so the iterations do not touch each other's output.
    Action<int> fillRow = row =>
    {
        int col = 0;
        while (col < productCols)
        {
            // Keep the running dot product in a local and store it once per cell.
            double dot = 0;
            int k = 0;
            while (k < innerLength)
            {
                dot += matA[row, k] * matB[k, col];
                k++;
            }
            result[row, col] = dot;
            col++;
        }
    };

    Parallel.For(0, productRows, fillRow);
}
#endregion
#region Main
/// <summary>
/// Entry point. Builds two random 800 x 800 matrices, multiplies them first with the
/// sequential routine and then with the parallel one, prints the elapsed milliseconds of
/// each run and offers to print each result.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
private static void Main(string[] args)
{
    // Dimensions: left is outerRows x inner, right is inner x outerCols.
    // Larger values make the gap between the two timings more visible.
    int inner = 800;
    int outerRows = 800;
    int outerCols = 800;

    double[,] left = InitializeMatrix(outerRows, inner);
    double[,] right = InitializeMatrix(inner, outerCols);
    double[,] product = new double[outerRows, outerCols];

    // --- Sequential run ---
    Console.WriteLine("Executing sequential loop...");
    Stopwatch timer = new Stopwatch();
    timer.Start();
    MultiplyMatricesSequential(left, right, product);
    timer.Stop();
    Console.WriteLine("Sequential loop time in milliseconds: {0}", timer.ElapsedMilliseconds);

    // Let the user inspect the numbers.
    OfferToPrint(outerRows, outerCols, product);

    // Fresh timer state and a fresh destination for the second run.
    timer.Reset();
    product = new double[outerRows, outerCols];

    // --- Parallel run ---
    Console.WriteLine("Executing parallel loop...");
    timer.Start();
    MultiplyMatricesParallel(left, right, product);
    timer.Stop();
    Console.WriteLine("Parallel loop time in milliseconds: {0}", timer.ElapsedMilliseconds);
    OfferToPrint(outerRows, outerCols, product);

    // Wait for a key so the console window stays open.
    Console.WriteLine("Press any key to exit.");
    Console.ReadKey();
}
#endregion
#region Helper_Methods
// Single generator shared by all InitializeMatrix calls. Creating a new Random per call
// can hand back identical sequences when the calls happen close together on runtimes
// that seed the parameterless constructor from the clock (.NET Framework), which would
// make the two matrices built in Main equal. Random is not thread-safe; in this file
// InitializeMatrix is only called sequentially from Main.
private static readonly Random s_random = new Random();

/// <summary>
/// Creates a <paramref name="rows"/> x <paramref name="cols"/> matrix whose cells are
/// random whole numbers in the range 0..99, stored as doubles.
/// </summary>
/// <param name="rows">Number of rows of the new matrix.</param>
/// <param name="cols">Number of columns of the new matrix.</param>
/// <returns>The newly allocated, fully populated matrix.</returns>
private static double[,] InitializeMatrix(int rows, int cols)
{
    var matrix = new double[rows, cols];
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            // Next(100) returns an int in [0, 100); it is widened to double on assignment.
            matrix[i, j] = s_random.Next(100);
        }
    }
    return matrix;
}
/// <summary>
/// Asks on the console whether the matrix should be printed and, if the user presses
/// 'y' or 'Y', writes it row by row.
/// </summary>
/// <param name="rowCount">Number of rows to print; rows 0..rowCount-1 of the matrix are read.</param>
/// <param name="colCount">Number of columns to print; columns 0..colCount-1 are read.</param>
/// <param name="matrix">The matrix whose cells are printed.</param>
private static void OfferToPrint(int rowCount, int colCount, double[,] matrix)
{
    Console.WriteLine("Computation complete. Print results? y/n");
    char c = Console.ReadKey().KeyChar;
    if (c != 'y' && c != 'Y')
    {
        return;
    }

    // Widening the window is purely cosmetic, so it must never abort the printout.
    TryWidenConsole(180);
    Console.WriteLine();
    for (int x = 0; x < rowCount; x++)
    {
        Console.WriteLine("ROW {0}: ", x);
        for (int y = 0; y < colCount; y++)
        {
            // "0.##" instead of "#.##": with '#' as the only integer placeholder a value
            // of 0 is rendered as an empty string, leaving a hole in the printed row.
            Console.Write("{0:0.##} ", matrix[x, y]);
        }
        Console.WriteLine();
    }
}

/// <summary>
/// Best-effort attempt to make the console window <paramref name="width"/> columns wide,
/// limited to the largest width the console reports. Failures are ignored.
/// </summary>
/// <param name="width">Desired window width in character columns.</param>
private static void TryWidenConsole(int width)
{
    try
    {
        Console.WindowWidth = Math.Min(width, Console.LargestWindowWidth);
    }
    catch (ArgumentOutOfRangeException)
    {
        // The console rejected the requested size; keep the current width.
    }
    catch (System.IO.IOException)
    {
        // No usable console window (for example redirected output); keep going.
    }
    catch (PlatformNotSupportedException)
    {
        // Resizing the window is not supported on this platform; keep going.
    }
}
#endregion
}
}
答案 0 :(得分:1)
矩阵乘法是一个“易并行”(embarrassingly parallel)问题,也就是说它可以很容易地并行化,因为结果数组中的每个单元都不依赖于任何其他单元的值。
通过按单元格而不是按行进行分区,您代码中的并行方案还可以进一步并行化,例如把结果矩阵中的每个单元格编号为 j + i * matBCols(注意:我没有测试过这段代码,可能弄错了某些索引;如果发现错误请留言):
/// <summary>
/// Computes the matrix product <paramref name="matA"/> x <paramref name="matB"/> with
/// Parallel.For, using one loop iteration per cell of the product, and stores it in
/// <paramref name="result"/>.
/// </summary>
/// <param name="matA">Left operand.</param>
/// <param name="matB">Right operand.</param>
/// <param name="result">Destination; each computed cell is overwritten.</param>
private static void MultiplyMatricesParallel(double[,] matA, double[,] matB, double[,] result)
{
    int matACols = matA.GetLength(1);
    int matBCols = matB.GetLength(1);
    int matARows = matA.GetLength(0);
    // Cells of the product are numbered ij = j + i * matBCols. checked() turns an int
    // overflow of the cell count into an OverflowException instead of a wrong loop bound.
    int cellCount = checked(matARows * matBCols);
    // Parallelize over the flattened cell index so the work is partitioned by cell
    // rather than by row.
    Parallel.For(0, cellCount, ij =>
    {
        // Recover the row and column from the flattened index. These must be locals of
        // the lambda so that every iteration has its own pair.
        int i = ij / matBCols;
        int j = ij % matBCols;
        // Accumulate in a local and store once per cell.
        double temp = 0;
        for (int k = 0; k < matACols; k++)
        {
            temp += matA[i, k] * matB[k, j];
        }
        result[i, j] = temp;
    }); // Parallel.For
}
在 C 中完成这项工作的朴素做法是为结果矩阵中的每个单元格创建一个线程,但这既浪费又不是最优的,因为 Parallel.For 实际上在幕后做了一些工作来优化计算速度。
在最理想的情况下,我们希望对数组进行划分,使每个核心分到等量的乘法运算。在包含 Parallel.For 的任务并行库(TPL)中,每个单元格的计算(我的示例)或每一行的计算(原始代码)都会被转换为一个任务。Parallel.For 会根据核心数量为每个核心分配工作线程,力求在各核心之间保持负载均衡并使线程数最少。在只有 2 个核心的理想情况下,这意味着两个线程,各承担一半的矩阵乘法。不过,TPL 还内置了动态负载均衡。
例如,如果某个核心变得繁忙(例如正在运行另一个进程),或者某个工作线程被阻塞(例如正在等待虚拟内存换页),TPL 就会创建更多线程并重新平衡工作负载。
您可以阅读here。
我想说的是,复现 Parallel.For 所做的工作并非易事。不过就矩阵乘法而言,即使放弃任务的动态重新分配,也能得到相当不错的近似实现:只需创建与 CPU 核心数量相同的线程,为每个线程设置核心亲和性,并把矩阵平均分给它们。
在 Windows 中,可以使用以下函数获取核心数:GetSystemInfo
(其他方法参见 here),并使用 CreateThread
和 SetThreadAffinityMask
创建具有核心亲和性的线程。