Question

我正在尝试使用OpenCL加速某些计算，部分算法包括反转矩阵。是否有任何开源库或免费提供的代码来计算用OpenCL或CUDA编写的矩阵或一般反演的lu分解（lapack dgetrf和dgetri）？矩阵是实数和正方形，但除此之外没有任何其他特殊属性。到目前为止，我已经设法在gpu上找到了基本的blas矩阵向量运算实现。

矩阵相当小，只有大约60-100行和cols，所以它可以在cpu上更快地计算，但它在算法的中间使用了，所以我必须将它转移到主机，计算反转，然后将结果传回设备，然后在更大的计算中使用它。

Answer 1

看看ViennaCL：http://viennacl.sourceforge.net/

Answer 2

我在Open CL中没有实现，但"Numerical Recipes"和Gil Strang的"Into to Applied Math"都有很好的解释，很容易编码。 “NR”具有您可以适应的C代码。

计算逆

这是不正确的。你没有计算LU分解的逆，你正在分解矩阵。如果你想要逆，你必须用一系列单位向量进行前向替换。这是一个小但重要的区别。

Answer 3

检查CULA

http://www.culatools.com/ http://www.culatools.com/versions/basic

Answer 4

我知道这有点晚了，但是如果你试图在一个小的（60-100行）矩阵上进行任何矩阵计算，那么计算在CPU上要快得多，而不是GPU因为将信息从主存储器复制到GPU的内存所花费的时间。如果您想要这样做，那么我建议使用OpenMP或MPI等并行语言，因为这些允许您并行化代码以加速CPU上的计算。

Answer 5

我使用本征库使用多线程在CPU上进行微积分最多2k x 2k，所以它现在比使用一个线程快3.5-3.65倍（取决于矩阵大小）。我使用了Intel Xeon 3.5Ghz E5-1620 v3处理器和16Gb ram。（不幸的是，我删除了旧版本以添加精确值，但是如果有优先级，我可以重写sw）

这是我用来比较的矩阵逆算法。（这是正确的，因为我说过要再次进行很多测试才能获得更好的结果）：

/*
Uses 2D arrays.
Main routines are:
init_2Dvector() that initializes any 2d vector (can be uchar, char, int, float or double)
multiply_2Dvector()
inverse()
*/

#include<iostream>
#include <vector>
#include <stdlib.h>
#include <time.h>

using namespace std;

/*
void print_2Dvector(vector<vector<double> >& vec)
{
    size_t xmax, ymax;
    ymax = vec.size();
    xmax = vec[0].size(); 

    int x, y;
    for (y = 0; y < ymax; y++)
    {
        for (x = 0; x < xmax; x++)
            cout << vec[y][x] << " \t";
        cout << endl;
    }
}*/

void print_2Dvector(vector<vector<double> >& vec,char *format="%lg \t")
{
    size_t xmax, ymax;
    ymax = vec.size();
    xmax = vec[0].size();

    int x, y;
    for (y = 0; y < ymax; y++)
    {
        {
            for (x = 0; x < xmax; x++)
                printf(format, vec[y][x]);
        }
        cout << endl;
    }
}

//Resizes to y_dim,x_dim any kind of 2d array:
template<typename T>
void init_2Dvector(vector<vector<T> >& vec, size_t y_dim, size_t x_dim)
{
    vec.resize(y_dim);
    for (size_t i = 0; i < vec.size(); i++)
        vec[i].resize(x_dim);
}
//Returns vec1*vec2. vec1 and 2 are not touch
vector< vector<double> > multiply_2Dvector(vector< vector<double> > vec1, vector< vector<double> > vec2)
{
    size_t xmax, ymax;
    ymax = vec1.size();   
    xmax = vec1[0].size();
    vector< vector<double> > vec3;
    if ((ymax != vec2[0].size()) || (xmax != vec2.size()))
    {
        cout << "ERROR on dim2_multiply() dimensions of vec2 not corresponding with vec1 ones" << endl; getchar(); return{};//returns a null
    }
    init_2Dvector(vec3, ymax, ymax);
    cout << "dimensions of vec3=" << vec3.size() << " x " << vec3[0].size() << endl;
    double xx;
    for (int y = 0; y < ymax; y++)
        for (int x = 0; x < ymax; x++)
        {
            xx = 0.0;
            for (int t = 0; t < xmax; t++)
                xx += vec1[y][t] * vec2[t][x];
            vec3[y][x] = xx;
        }
    return vec3;//ok
}

//returns inverse of x2, x2 is not modified
vector< vector<double> > inverse(vector< vector<double> > x)
{
    if (x.size() != x[0].size())
    {
        cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
    }

    size_t dim = x.size();
    int i, j, ord;
    vector< vector<double> > y(dim,vector<double>(dim));//output
    //init_2Dvector(y, dim, dim);
    //1. Unity array y: 
    for (i = 0; i < dim; i++)
    {
        y[i][i] = 1.0;
        for (j = i+1; j < dim; j++)
        {
            y[i][j]= y[j][i] = 0.0;
        }
    }

    double diagon, coef;
    double *ptrx, *ptry, *ptrx2, *ptry2;
    for (ord = 0; ord<dim; ord++)
    {
        //2 Hacemos diagonal de x =1
        int i2;
        if (fabs(x[ord][ord])<1e-15) //Si el elemento diagonal es 0 sumamos una columna que no sea 0 el elemento correspondiente
        {
            for (i2 = ord + 1; i2<dim; i2++)
            {
                if (fabs(x[i2][ord])>1e-15) break;
            }
            if (i2 >= dim)
                return{};//error, returns null
            for (i = 0; i<dim; i++)//sumo la linea que no es 0 el de la misma fila de ord
            {
                x[ord][i] += x[i2][i];
                y[ord][i] += y[i2][i];
            }
        }
        diagon = 1.0/x[ord][ord];
        ptry = &y[ord][0];
        ptrx = &x[ord][0];
        for (i = 0; i < dim; i++)
        {
            *ptry++ *= diagon;
            *ptrx++ *= diagon;
        }

        //Hacemos '0' la columna ord salvo elemento diagonal:
        for (i = 0; i<dim; i++)//Empezamos por primera fila
        {
            if (i == ord) continue;
            coef = x[i][ord];//elemento ha hacer 0 
            if (fabs(coef)<1e-15) continue; //si es cero se evita
            ptry = &y[i][0];
            ptry2 = &y[ord][0];
            ptrx = &x[i][0];
            ptrx2 = &x[ord][0];
            for (j = 0; j < dim; j++)
            {
                *ptry++ = *ptry - coef * (*ptry2++);//1ª matriz
                *ptrx++ = *ptrx - coef * (*ptrx2++);//2ª matriz
            }
        }
    }//end ord
    return y;
}


void test_5_inverse()
{
    vector< vector<double> > vec1 = {
        {0,-5,0,7,33,11,-1},
        {72,0,-11,7,9,33,5 },
        {-13,31,-5,15,29,30,24 },
        {-24,9,8,-23,31,-12,4 },
        {-3,-22,4,-24,-5,27,-10 },
        {-10,-21,-16,-32,-11,20,14 },
        {5,30,13,-32,29,-13,-13 }
    };
    vector< vector<double> > vec2;
    vec2 = inverse(vec1);
    vector< vector<double> > vec3;
    vec3 = multiply_2Dvector(vec1, vec2);

    cout << "initial array (must be unmodified):" << endl;
    print_2Dvector(vec1);

    cout << "Must be diagon array:" << endl;
    print_2Dvector(vec3," %8.3lf");
    cout << endl;
}


void test_6_inverse(int dim)
{
    vector< vector<double> > vec1(dim, vector<double>(dim));
    for (int i=0;i<dim;i++)
        for (int j = 0; j < dim; j++)
        {
            vec1[i][j] = (-1.0 + 2.0*rand() / RAND_MAX) * 10000;
        }

    vector< vector<double> > vec2;
    double ini, end;
    ini = (double)clock();
    vec2 = inverse(vec1);
    end = (double)clock();
    cout << "Time inverse =" << (end - ini) / CLOCKS_PER_SEC << endl;
    vector< vector<double> > vec3;
    vec3 = multiply_2Dvector(vec1, vec2);

    cout << "initial array (must be unmodified):" << endl;
    //print_2Dvector(vec1);

    cout << "Must be diagon array:" << endl;
    //print_2Dvector(vec3, " %8.3lf");
    cout << endl;
}

int main()
{
    vector< vector<double> > vec1;
    init_2Dvector(vec1, 10, 5);    //size_t ymax = vec1.size(),xmax = vec1[0].size();
    //test_2_dimension_vector();
    //test_one_dimension_vector();
    test_5_inverse();
    test_6_inverse(1300);
    cout << endl << "=== END ===" << endl; getchar(); 
    return 1;
}

Answer 6

原始问题（现在7岁）实际上是在4年后的paper describing matrix inversion in CUDA based on Gauss-Jordan中解决的。它尝试在不同的线程上分配计算，并为大小为2048的矩阵提供详细的性能指示。

虽然不是OpenCL，但一般的想法很容易从CUDA转化。

OpenCL中的矩阵求逆

6 个答案: