Question

问题

大小为 M × N 的矩阵 A 以行优先方式存储。对于 A 的每一列，我需要取出该列最大元素及其上下两侧各 L 个元素（共 2L + 1 个），放入矩阵 B 的对应列中。矩阵 B 的大小为 (2L + 1) × N。假设索引永远不会超出边界。

示例

Original matrix
A = 2   4   7
    5   4   8
    3   12  10
    10  6   9
    1   7   11
    2   1   10
    6   0   1

Cut length L = 2

Result matrix
B = 5   4   10
    3   4   9
    10  12  11
    1   6   10
    2   7   1

我效率低下的解决方案

第1步：找到每列最大元素的索引。

我使用了 “Determining the least element and its position in each matrix column with CUDA Thrust” 中提出的方法。

第2步：将数据从A传输到B。

因为 A 以行优先方式存储，为了方便使用 Thrust，我首先将其转换为列优先方式。其次，我在 for 循环中对每一列调用 thrust::copy_n，并将结果存储在一个临时矩阵中。最后，转置该临时矩阵得到 B。

在实际情况下，A 的大小为 256 × 20024，L = 2048。A 是一个复数矩阵，因此“最大元素”指的是模最大的元素。

我的代码：

// Shorthand for Thrust's single-precision complex type used throughout this snippet.
typedef thrust::complex<float> comThr;

// ---------------- struct for thrust ---------------- //
// Unary functor mapping a complex sample to the value of thrust::abs() for that
// sample. Callable from both host and device code.
struct complex_abs: public thrust::unary_function<comThr, float>
{
    // const-qualified: the functor holds no state, so it must also be callable
    // through a const object or reference.
    __host__ __device__
    float operator() (comThr a) const
    {
        return thrust::abs(a);
    }
};

// A-> d_Data, B->d_DataCut
//
// Copies, for every one of the `rows` lines of the input, the 2*L+1 consecutive
// samples centred on the index reported by getMaxInColumns() into d_DataCut.
//
// d_Data    : device buffer holding rows*cols cuComplex samples; it is handed to
//             cuBLAS with leading dimension `rows`.
// d_DataCut : device buffer receiving rows*(2*L+1) cuComplex samples; it is written
//             by cuBLAS with leading dimension `rows`.
// rows,cols : dimensions of the input as passed to cuBLAS.
// L         : number of samples kept on each side of the selected sample.
//
// No bounds checking is done: the caller must guarantee that
// [maxIdx - L, maxIdx + L] lies inside every line.
// NOTE(review): getMaxInColumns() is defined elsewhere; it is presumed to fill
// maxIdx[ii] with the position of the largest |value| of line ii -- confirm.
void CutMatrix(cuComplex *d_Data, cuComplex *d_DataCut, int rows, int cols, int L)
{
    const int numElems = rows*cols;
    const int window = 2*L+1;   // samples kept per line

    // step 1: compute abs of every element
    // type convert(cuComplex->thrust)
    comThr *thr_d_temp_Data = reinterpret_cast<comThr*>(d_Data);
    thrust::device_ptr<comThr> thr_Data = thrust::device_pointer_cast(thr_d_temp_Data);
    thrust::device_vector<float> thr_dataAbs(numElems);
    // compute abs()
    complex_abs op_abs;
    thrust::transform(thrust::device,thr_Data,thr_Data+numElems,thr_dataAbs.begin(),op_abs);

    // step 2: get the maximum value (and its index) of every line
    thrust::device_vector<float>maxVal(rows);
    thrust::device_vector<int>maxIdx(rows);

    getMaxInColumns(thr_dataAbs,maxVal,maxIdx,cols,rows);  // refer the link

    // step 3: cut data size to rows*(2*L+1)
    // Transpose the input into d_tempData so that each of the `rows` lines
    // occupies `cols` consecutive elements.
    cuComplex *d_tempData;
    checkCudaErrors(cudaMalloc((void**)&d_tempData,sizeof(cuComplex)*rows*cols));
    // No memset needed: the geam call below uses beta == 0 and writes every
    // element of the cols x rows output (ldc == cols).

    cublasHandle_t handle_TransData;
    checkCublasErrors(cublasCreate(&handle_TransData));

    // C = 1 * op(A) + 0 * op(B), i.e. a plain transpose.
    cuComplex alpha_TransData;
    alpha_TransData.x = 1.0f;
    alpha_TransData.y = 0.0f;
    cuComplex beta_TransData;
    beta_TransData.x = 0.0f;
    beta_TransData.y = 0.0f;
    checkCublasErrors(cublasCgeam(handle_TransData,CUBLAS_OP_T,CUBLAS_OP_T,cols,rows,&alpha_TransData,
            d_Data,rows,&beta_TransData,d_Data,rows,d_tempData,cols));

    // transfer data
    // type convert(cuComplex->thrust)
    comThr *thr_d_temp_DataII = reinterpret_cast<comThr*>(d_tempData);
    thrust::device_ptr<comThr> thr_tempData = thrust::device_pointer_cast(thr_d_temp_DataII);

    thrust::device_vector<comThr> thr_DataCut(rows*window); // one window of 2*L+1 samples per line

    // The per-line offsets are computed on the host, so bring the indices back.
    thrust::host_vector<int>h_maxIdx(rows);
    thrust::copy(maxIdx.begin(),maxIdx.end(),h_maxIdx.begin());

    for(int ii=0;ii<rows;ii++){
        // First sample of the window inside line ii of the transposed data.
        const int offset = ii*cols + h_maxIdx[ii] - L;
        thrust::copy_n(thr_tempData+offset,window,thr_DataCut.begin()+ii*window);
    }

    // Transpose the windows back into the caller's layout (leading dimension `rows`).
    cuComplex *d_tempDataCut = reinterpret_cast<cuComplex*>(thrust::raw_pointer_cast(thr_DataCut.data()));
    checkCublasErrors(cublasCgeam(handle_TransData,CUBLAS_OP_T,CUBLAS_OP_T,rows,window,&alpha_TransData,
            d_tempDataCut,window,&beta_TransData,d_tempDataCut,window,d_DataCut,rows));

    // Release the cuBLAS handle; without this every call leaks one handle.
    checkCublasErrors(cublasDestroy(handle_TransData));
    checkCudaErrors(cudaFree(d_tempData));
}

在大小为256 * 20024的数据集（GPU Tesla M6）上，上述整个过程将耗时近1毫秒。

问题

是否有更好且更简洁的方法来解决此问题？是否可以完全使用 Thrust，或者编写一个完善的内核函数来解决？

谢谢！

使用CUDA根据每一列的最大元素索引来定制矩阵

0 个答案: