Thrust copy - 按列主序(column-major order)写入的 OutputIterator

时间:2014-01-08 08:53:15

标签: cuda thrust

我有一个由多个矩阵组成的 vector(每个矩阵都以列主序数组的形式存储),想把这些矩阵垂直拼接起来。为此,我想使用 Thrust 库中的 copy 函数,如以下示例代码片段所示:

// Appends the raw buffer of every matrix to `result`, one after another.
// `offset` is the number of elements already written to `result`.
// NOTE(review): matrices[i] and result are wrapped in thrust::device_ptr, so
// they are presumably raw device pointers -- confirm against the caller.
int offset = 0;
for(int i = 0; i < matrices.size(); ++i) {
    thrust::device_ptr<float> srcBegin(matrices[i]);
    thrust::device_ptr<float> srcEnd = srcBegin + rows[i] * cols[i];
    thrust::device_ptr<float> dstBegin = thrust::device_ptr<float>(result) + offset;

    thrust::copy(srcBegin, srcEnd, dstBegin);

    // The next matrix starts right after the elements just copied.
    offset += rows[i] * cols[i];
}

编辑:扩展示例:

问题是:如果我有一个矩阵 A = [[1,2,3],[4,5,6]](2 行 3 列;内存中为 [1,4,2,5,3,6])和另一个矩阵 B = [[7,8,9]](1 行 3 列;内存中为 [7,8,9]),那么得到的矩阵 C 并不是 [[1,2,3],[4,5,6],[7,8,9]](3 行 3 列;内存中为 [1,4,7,2,5,8,3,6,9]),而是 [[1,5,7],[4,3,8],[2,6,9]](3 行 3 列;内存中为 [1,4,2,5,3,6,7,8,9])。

有没有办法为这个问题创建一个特殊的OutputIterator(我搜索过它,但什么都没找到),或者是一种快速替代方法?

编辑:SSCCE

#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/device_vector.h>
#include <iostream>

void printMat2d(thrust::device_vector<float>& mat, int rows, int cols) {
    for(int row = 0; row < rows; ++row) {
        for(int col = 0; col < cols; ++col) {
            std::cout << mat[row + col * rows] << " ";
        }
        std::cout << std::endl;
    }
}

void printMat1d(thrust::device_vector<float>& mat, int rows, int cols) {
    for(int idx = 0; idx < cols*rows; ++idx) {
            std::cout << mat[idx] << " ";
    }
    std::cout << std::endl;
}

/// Fills `mat` with a rows x cols test matrix stored in column-major order.
///
/// Read row by row, the matrix contains add, add + 1, add + 2, ...; i.e.
/// element (r, c) has the value `add + r * cols + c`.
/// The data is assembled in a host buffer and then assigned to `mat`.
void generateMat(thrust::device_vector<float>& mat, int rows, int cols, int add) {
    thrust::host_vector<float> staging(rows * cols);
    for(int c = 0; c < cols; ++c) {
        for(int r = 0; r < rows; ++r) {
            // Column-major slot of (r, c) receives its row-major rank plus `add`.
            staging[c * rows + r] = add + r * cols + c;
        }
    }
    mat = staging;
}

/// Reproduces the problem: builds a 2x3 and a 3x3 column-major matrix on the
/// device, copies their buffers back to back into a 5x3 result and prints
/// everything.
int main() {
    // Dimensions of the two input matrices.
    const int rowCounts[] = {2, 3};
    const int colCounts[] = {3, 3};
    std::vector<int> rows(rowCounts, rowCounts + 2);
    std::vector<int> cols(colCounts, colCounts + 2);

    // Generate each matrix and print it as a grid and as raw memory.
    std::vector<thrust::device_vector<float> > matrices(2);
    for(size_t m = 0; m < matrices.size(); ++m) {
        generateMat(matrices[m], rows[m], cols[m], m*10);

        std::cout << "mat_ " << m << " = " << std::endl;
        printMat2d(matrices[m], rows[m], cols[m]);
        printMat1d(matrices[m], rows[m], cols[m]);
    }

    // Plain linear copy: the buffer of matrix 1 is placed directly behind the
    // buffer of matrix 0. Because both are column-major, this does NOT stack
    // the matrices vertically when `result` is read as a 5x3 column-major
    // matrix -- which is exactly the behaviour the question is about.
    const int resultRows = 5;
    const int resultCols = 3;
    thrust::device_vector<float> result(resultRows * resultCols);
    int written = 0;
    for(size_t m = 0; m < matrices.size(); ++m) {
        thrust::copy(matrices[m].begin(), matrices[m].end(), result.begin() + written);
        written += rows[m] * cols[m];
    }

    std::cout << "result = " << std::endl;
    printMat2d(result, resultRows, resultCols);
    printMat1d(result, resultRows, resultCols);

    return 0;
}

1 个答案:

答案 0 :(得分:1)

编辑:我已经用一种略有不同的方法替换了之前那个按行进行跨步(strided)复制的答案。新方法把复制操作减少为:每个被复制的矩阵只需要一次 Thrust 调用。

这里的关键思想是使用一个把行主序内存索引转换为列主序内存索引的仿函数(functor)。把这个仿函数与 counting_iterator 结合起来(通过 make_transform_iterator),就可以生成任意的"行主序到列主序"内存索引序列。随后,这些索引可以用在针对源矩阵的 permutation_iterator 中,用来选择要复制的元素;也可以用在针对目标矩阵的 permutation_iterator 中,用来选择要写入的内存位置。关于 transform_iterator、counting_iterator 和 permutation_iterator 的概述,请参阅 Thrust 快速入门指南(thrust quick start guide)。这次练习我使用的是 CUDA 5.0 和 Thrust 1.5.3。

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>

/// Maps a row-major linear index to the column-major linear index of the same
/// (row, column) position in a matrix with `r` rows and `c` columns.
///
/// Intended to be wrapped in a thrust::transform_iterator over a
/// counting_iterator so that it yields a sequence of column-major offsets.
struct rm2cm_idx_functor : public thrust::unary_function<int, int>
{
  int r;  // number of rows of the matrix being indexed
  int c;  // number of columns of the matrix being indexed

  /// @param _r  number of rows
  /// @param _c  number of columns; must be non-zero before operator() is used
  __host__ __device__
  rm2cm_idx_functor(int _r, int _c) : r(_r), c(_c) {}

  /// @param idx  row-major linear index (expected to be non-negative)
  /// @return     `column * r + row`, where `row = idx / c` and
  ///             `column = idx % c`
  __host__ __device__
  int operator() (int idx) const {
    const int my_r = idx / c;  // row of idx in row-major order
    const int my_c = idx % c;  // column of idx in row-major order
    return (my_c * r) + my_r;
  }
};

// Element type of the matrices handled by the functions in this example.
typedef float my_type;


void printMat2d(thrust::device_vector<my_type>& mat, int rows, int cols) {
    for(int row = 0; row < rows; ++row) {
        for(int col = 0; col < cols; ++col) {
            std::cout << mat[row + col * rows] << " ";
        }
        std::cout << std::endl;
    }
}

void printMat1d(thrust::device_vector<my_type>& mat, int rows, int cols) {
    for(int idx = 0; idx < cols*rows; ++idx) {
            std::cout << mat[idx] << " ";
    }
    std::cout << std::endl;
}

/// Replaces the contents of `mat` with a rows x cols test matrix in
/// column-major storage.
///
/// Walking the matrix row by row, the elements are numbered consecutively
/// starting at `add`. The values are built in a host-side buffer which is
/// then assigned to `mat`.
void generateMat(thrust::device_vector<my_type>& mat, int rows, int cols, int add) {
    thrust::host_vector<my_type> hostBuf(rows * cols);
    int next = add;  // value for the next element in row-by-row order
    for(int r = 0; r < rows; ++r) {
        // Consecutive elements of one row are `rows` slots apart in
        // column-major storage, so step through the row with that stride.
        for(int c = 0, slot = r; c < cols; ++c, slot += rows) {
            hostBuf[slot] = next++;
        }
    }
    mat = hostBuf;
}


void copyMat(thrust::device_vector<my_type>& src, thrust::device_vector<my_type>& dst, unsigned src_rows, unsigned src_cols, unsigned dst_rows, unsigned offset){
   thrust::copy_n(thrust::make_permutation_iterator(src.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), rm2cm_idx_functor(src_rows, src_cols))), src_rows*src_cols, thrust::make_permutation_iterator(dst.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(offset), rm2cm_idx_functor(dst_rows, src_cols))));
}



/// Demo driver: builds a 2x3 and a 3x3 column-major matrix on the device,
/// stacks them vertically into one result matrix via copyMat and prints the
/// inputs and the result.
int main() {
    std::vector<int> rows(2);
    rows[0] = 2;
    rows[1] = 3;
    std::vector<int> cols(2);
    cols[0] = 3;
    cols[1] = 3;

    //generate matrices
    std::vector<thrust::device_vector<my_type> > matrices(2);
    for(size_t i = 0; i < matrices.size(); ++i) {
        generateMat(matrices[i], rows[i], cols[i], i*10);

        std::cout << "mat_ " << i << " = " << std::endl;
        printMat2d(matrices[i], rows[i], cols[i]);
        printMat1d(matrices[i], rows[i], cols[i]);
    }

    //copy
    // The stacked result has as many rows as all inputs together and the
    // (shared) column count of the inputs; derive both from rows/cols instead
    // of hard-coding them.
    int resultRows = 0;
    for(size_t i = 0; i < rows.size(); ++i) {
        resultRows += rows[i];
    }
    const int resultCols = cols[0];
    thrust::device_vector<my_type> result(resultRows * resultCols);

    // `offset` is the row-major element index in `result` at which the next
    // matrix starts.
    int offset = 0;
    for(size_t i = 0; i < matrices.size(); ++i) {
        copyMat(matrices[i], result, rows[i], cols[i], resultRows, offset);
        offset += rows[i]*cols[i];
    }

    std::cout << "result = " << std::endl;
    printMat2d(result, resultRows, resultCols);
    printMat1d(result, resultRows, resultCols);

    return 0;
}

这种做法还假定源矩阵的列数等于目标矩阵的列数,这一点似乎隐含在你的问题描述中。照例声明:我不保证这段代码没有错误,但对于原问题中自带的测试用例,它看起来是可以正常工作的。

这种方法可能仍有进一步改进的余地。目前,与 thrust::copy_n 调用相关的读操作和写操作都是非合并(uncoalesced)的内存访问。我们可以让这两个操作中的一个变成合并访问,从而进一步改进。这需要把读、写两个索引转换仿函数的效果合并成一个同时考虑源矩阵和目标矩阵维度的映射函数。使用单个映射仿函数后,copy_n 调用的第一个参数就可以直接是源向量。我认为也应该可以改用 thrust::gather 或 thrust::scatter 来实现。不过,我还没有把它完全做出来。