/// <summary>
/// Multiplies two matrices on the default GPU, using explicitly allocated 2D device memory
/// addressed through raw device pointers.
/// </summary>
/// <param name="m1">The left matrix, of size h x l.</param>
/// <param name="m2">The right matrix, of size l x w.</param>
/// <returns>A new h x w host matrix containing the product of <paramref name="m1"/> and <paramref name="m2"/>.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The number of columns of <paramref name="m1"/> differs from the number of rows of <paramref name="m2"/>.
/// </exception>
public static double[,] Multiply([NotNull] this double[,] m1, [NotNull] double[,] m2)
{
    // Checks (the single-string constructor would treat the text as a parameter name)
    if (m1.GetLength(1) != m2.GetLength(0))
    {
        throw new ArgumentOutOfRangeException(nameof(m2), "Invalid matrices sizes");
    }

    // Initialize the parameters
    int h = m1.GetLength(0);
    int w = m2.GetLength(1);
    int l = m1.GetLength(1);

    // Execute the multiplication in parallel
    using (DeviceMemory2D<double> m1_device = Gpu.Default.AllocateDevice(m1))
    using (DeviceMemory2D<double> m2_device = Gpu.Default.AllocateDevice(m2))
    using (DeviceMemory2D<double> mresult_device = Gpu.Default.AllocateDevice<double>(h, w))
    {
        // Pointers setup
        deviceptr<double>
            pm1 = m1_device.Ptr,
            pm2 = m2_device.Ptr,
            pmresult = mresult_device.Ptr;

        // 2D device memory pads the end of every row, so a row starts every "pitch"
        // elements rather than every "width" elements. Each buffer has its own pitch.
        int
            pitch1 = m1_device.PitchInElements.ToInt32(),
            pitch2 = m2_device.PitchInElements.ToInt32(),
            pitchResult = mresult_device.PitchInElements.ToInt32();

        // Local wrapper function: computes one element of the result per invocation
        void Kernel(int ki)
        {
            // Calculate the current indexes (ki enumerates the logical h x w result row by row)
            int
                i = ki / w,
                j = ki % w;

            // Perform the multiplication
            double sum = 0;
            int im1 = i * pitch1;
            for (int k = 0; k < l; k++)
            {
                // m1[i, k] * m2[k, j]
                sum += pm1[im1 + k] * pm2[k * pitch2 + j];
            }
            pmresult[i * pitchResult + j] = sum; // result[i, j]
        }

        // Run the kernel once for each element of the result matrix
        Gpu.Default.For(0, h * w, Kernel);

        // Return the result
        return Gpu.Copy2DToHost(mresult_device);
    }
}
/// <summary>
/// Multiplies two matrices on the default GPU, using arrays allocated through
/// <c>Gpu.Default.Allocate</c> and indexed with regular 2D array syntax inside the kernel.
/// </summary>
/// <param name="m1">The left matrix, of size h x l.</param>
/// <param name="m2">The right matrix, of size l x w.</param>
/// <returns>A new h x w host matrix containing the product of <paramref name="m1"/> and <paramref name="m2"/>.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The number of columns of <paramref name="m1"/> differs from the number of rows of <paramref name="m2"/>.
/// </exception>
public static double[,] MultiplyGpuManaged([NotNull] this double[,] m1, [NotNull] double[,] m2)
{
    // Checks (the single-string constructor would treat the text as a parameter name)
    if (m1.GetLength(1) != m2.GetLength(0))
    {
        throw new ArgumentOutOfRangeException(nameof(m2), "Invalid matrices sizes");
    }

    // Initialize the parameters and the result matrix
    int h = m1.GetLength(0);
    int w = m2.GetLength(1);
    int l = m1.GetLength(1);
    double[,]
        m1_gpu = Gpu.Default.Allocate(m1),
        m2_gpu = Gpu.Default.Allocate(m2),
        mresult_gpu = Gpu.Default.Allocate<double>(h, w);

    try
    {
        // Execute the multiplication in parallel, one result element per index
        Gpu.Default.For(0, h * w, index =>
        {
            // Calculate the current indexes
            int
                i = index / w,
                j = index % w;

            // Perform the multiplication
            double sum = 0;
            for (int k = 0; k < l; k++)
            {
                sum += m1_gpu[i, k] * m2_gpu[k, j];
            }
            mresult_gpu[i, j] = sum;
        });

        // Copy the result back
        return Gpu.CopyToHost(mresult_gpu);
    }
    finally
    {
        // Free the device memory even if the kernel or the copy throws
        Gpu.Free(m1_gpu);
        Gpu.Free(m2_gpu);
        Gpu.Free(mresult_gpu);
    }
}
/// <summary>
/// Multiplies two matrices on the CPU, computing every element of the result in parallel
/// with <see cref="Parallel.For(int, int, Action{int})"/>.
/// </summary>
/// <param name="m1">The left matrix, of size h x l.</param>
/// <param name="m2">The right matrix, of size l x w.</param>
/// <returns>A new h x w matrix containing the product of <paramref name="m1"/> and <paramref name="m2"/>.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The number of columns of <paramref name="m1"/> differs from the number of rows of <paramref name="m2"/>.
/// </exception>
public static double[,] MultiplyOnCPU([NotNull] this double[,] m1, [NotNull] double[,] m2)
{
    // Checks (the single-string constructor would treat the text as a parameter name)
    if (m1.GetLength(1) != m2.GetLength(0))
    {
        throw new ArgumentOutOfRangeException(nameof(m2), "Invalid matrices sizes");
    }

    // Initialize the parameters and the result matrix
    int h = m1.GetLength(0);
    int w = m2.GetLength(1);
    int l = m1.GetLength(1);
    double[,] result = new double[h, w];

    // The pointer arithmetic below needs an unsafe context
    unsafe
    {
        Parallel.For(0, h * w, index =>
        {
            // Fixed locals cannot be captured by a lambda, so the arrays are pinned inside it.
            // Managed 2D arrays are contiguous and row-major, hence the flat "row * width + column" offsets.
            fixed (double* presult = result, pm1 = m1, pm2 = m2)
            {
                // Calculate the current indexes
                int
                    i = index / w,
                    j = index % w;

                // Perform the multiplication
                double sum = 0;
                int im1 = i * l;
                for (int k = 0; k < l; k++)
                {
                    // m1[i, k] * m2[k, j]
                    sum += pm1[im1 + k] * pm2[k * w + j];
                }
                presult[i * w + j] = sum; // result[i, j]
            }
        });
    }
    return result;
}
答案 0 :(得分:0)
事实证明，问题出在 GPU 分配二维数组的方式上：它不像标准 .NET 数组那样使用单个连续的内存块，而是出于性能考虑，在每一行的末尾添加了一些填充。
对 GPU 二维数组进行寻址的正确方法是使用间距（pitch），它表示每一行的实际宽度（列数 + 填充）。
下面是一个可以正常运行的代码示例，它只是填充一个 GPU 二维数组，然后将其复制回主机：
// Fills a size x size matrix on the GPU and copies it back to the host
const int size = 10;
double[,] matrix_gpu;
using (DeviceMemory2D<double> device = Gpu.Default.AllocateDevice<double>(size, size))
{
    // Distance between the starts of two consecutive rows, in elements (columns + padding)
    int stride = device.PitchInElements.ToInt32();
    deviceptr<double> cells = device.Ptr;

    // One parallel iteration per row; each one fills its row with consecutive values
    Gpu.Default.For(0, size, row =>
    {
        int rowStart = row * stride;
        for (int col = 0; col < size; col++)
        {
            cells[rowStart + col] = row * size + col;
        }
    });

    // Copy the matrix back to a regular host array
    matrix_gpu = Gpu.Copy2DToHost(device);
}