I have a matrix and I want to compute its column-wise mean (simplified here to a plain sum) with CUDA in the fastest way possible, i.e. return a row vector containing the mean of every column of the matrix. The sum-reduction implementation I use to compute the sum of a single column vector looks like this:
template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
    extern __shared__ T sdata[];

    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory
    T x = 0.0;
    if (tid < n) {
        x = input[tid];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}
which is invoked as:
int n = ... // vector size
const int BLOCK_SIZE = 1024;
int number_of_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
double* per_block_results = NULL;
cudaMalloc((void**) &per_block_results, sizeof(double)*(number_of_blocks + 1));
// launch one kernel to compute, per-block, a partial sum
kernelSum<double> <<<number_of_blocks, BLOCK_SIZE, BLOCK_SIZE*sizeof(double)>>>(a, per_block_results, n);
// launch a single block to compute the sum of the partial sums
kernelSum<double> <<<1, number_of_blocks, number_of_blocks*sizeof(double)>>>(per_block_results, per_block_results + number_of_blocks, number_of_blocks);
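After the second launch the final sum ends up in per_block_results[number_of_blocks]; reading it back and dividing by n then gives the mean, for example:
// sketch: fetch the final sum on the host and turn it into the column mean
double total = 0.0;
cudaMemcpy(&total, per_block_results + number_of_blocks, sizeof(double), cudaMemcpyDeviceToHost);
double mean = total / (double)n;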
I could generalize this kernel to a matrix with any number of columns, but I am limited by shared memory. My GPU has compute capability 3.5, so it has 48 KB of shared memory and a maximum block size of 1024, i.e. threads per block. Since I am interested in double precision, that gives at most 48 * 1024 / 8 = 6144 doubles of shared memory. Since the reduction is done per block, I can have at most 6144 (doubles in shared memory) / 1024 (block size) = 6 columns whose sum reductions I can compute simultaneously. Reducing the block size would then allow more columns to be computed simultaneously, e.g. 6144 (doubles in shared memory) / 512 (block size) = 12.
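For concreteness, here is a rough, untested sketch of the generalization I have in mind (column-major storage assumed, so each column's n elements are contiguous; each block reduces a tile of COLS_PER_BLOCK columns, and shared memory holds blockDim.x * COLS_PER_BLOCK values, which is exactly what the arithmetic above caps at 6 columns for 1024-thread blocks):
// Rough, untested sketch of the multi-column generalization described above.
// Shared memory usage is blockDim.x * COLS_PER_BLOCK * sizeof(T).
template<typename T, int COLS_PER_BLOCK>
__global__ void kernelSumCols(const T* __restrict__ input, T* __restrict__ per_block_results,
                              const size_t n, const size_t num_cols)
{
    extern __shared__ T sdata[];   // blockDim.x * COLS_PER_BLOCK elements

    size_t row  = blockIdx.x * blockDim.x + threadIdx.x;
    size_t col0 = (size_t)blockIdx.y * COLS_PER_BLOCK;   // first column handled by this block

    // load one element per thread for each column of the tile (column-major storage assumed)
    for (int c = 0; c < COLS_PER_BLOCK; ++c) {
        T x = 0.0;
        if (row < n && col0 + c < num_cols) x = input[(col0 + c) * n + row];
        sdata[c * blockDim.x + threadIdx.x] = x;
    }
    __syncthreads();

    // same contiguous range pattern as above, applied once per column of the tile
    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if (threadIdx.x < offset) {
            for (int c = 0; c < COLS_PER_BLOCK; ++c)
                sdata[c * blockDim.x + threadIdx.x] += sdata[c * blockDim.x + threadIdx.x + offset];
        }
        __syncthreads();
    }

    // thread 0 writes one partial sum per column of the tile
    if (threadIdx.x == 0) {
        for (int c = 0; c < COLS_PER_BLOCK; ++c)
            if (col0 + c < num_cols)
                per_block_results[(col0 + c) * gridDim.x + blockIdx.x] = sdata[c * blockDim.x];
    }
}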
Would this more elaborate strategy beat a simple CPU loop over every column of the matrix that invokes the sum reduction? Is there another, better way to do this?
Answer 0 (score 3):
What is stopping you from doing something like this:
template<typename T>
__global__ void kernelSum(const T* __restrict__ input,
                          T* __restrict__ per_block_results,
                          const size_t lda, const size_t n)
{
    extern __shared__ T sdata[];

    // Accumulate per thread partial sum
    T x = 0.0;
    const T * p = &input[blockIdx.x * lda];
    for(int i=threadIdx.x; i < n; i += blockDim.x) {
        x += p[i];
    }

    // load partial sum into __shared__ memory
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}
[Standard disclaimer: written in the browser, never compiled or tested, use at your own risk]
i.e. each thread in the block only needs one entry in sdata for the shared-memory reduction. Each thread sums as many values as required to cover the whole column of input. Then there is no shared-memory restriction, and you can sum columns of any size with the same block size.
EDIT: Apparently the idea of using a per-thread partial sum is new to you, so here is a complete example to study:
#include <iostream>

template<typename T>
__global__
void kernelSum(const T* __restrict__ input,
               T* __restrict__ per_block_results,
               const size_t lda, // pitch of input in words of sizeof(T)
               const size_t n)
{
    extern __shared__ T sdata[];

    T x = 0.0;
    const T * p = &input[blockIdx.x * lda];
    // Accumulate per thread partial sum
    for(int i=threadIdx.x; i < n; i += blockDim.x) {
        x += p[i];
    }

    // load thread partial sum into shared memory
    sdata[threadIdx.x] = x;
    __syncthreads();

    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}
int main(void)
{
    const int m = 10000, n = 16;
    float * a = new float[m*n];
    for(int i=0; i<(m*n); i++) { a[i] = (float)(i%10); }

    float *a_;
    size_t size_a = m * n * sizeof(float);
    cudaMalloc((void **)&a_, size_a);
    cudaMemcpy(a_, a, size_a, cudaMemcpyHostToDevice);

    float *b_;
    size_t size_b = n * sizeof(float);
    cudaMalloc((void **)&b_, size_b);

    // select number of warps per block according to size of the
    // column and launch one block per column. Probably makes sense
    // to have at least 4:1 column size to block size
    dim3 blocksize(256);
    dim3 gridsize(n);
    size_t shmsize = sizeof(float) * (size_t)blocksize.x;
    kernelSum<float><<<gridsize, blocksize, shmsize>>>(a_, b_, m, m);

    float * b = new float[n];
    cudaMemcpy(b, b_, size_b, cudaMemcpyDeviceToHost);
    for(int i=0; i<n; i++) {
        std::cout << i << " " << b[i] << std::endl;
    }

    cudaDeviceReset();
    return 0;
}
You should experiment with the block size relative to the dimensions of your matrix for optimal performance, but in general the more work per thread the kernel does, the better the overall performance will be (because the shared-memory reduction is quite expensive). You can see one approach to block and grid size heuristics for a similarly memory-bandwidth-bound problem in this answer.
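As a rough illustration (a hypothetical helper, not part of the answer above), the "at least 4:1 column size to block size" rule of thumb could be encoded like this:
// Hypothetical helper: largest power-of-two block size, capped at 1024 threads,
// that still leaves each thread at least work_per_thread column elements to accumulate.
static unsigned int pickBlockSize(size_t column_len, size_t work_per_thread = 4)
{
    unsigned int bs = 32;                                      // at least one warp
    while (bs < 1024 && (size_t)(bs * 2) * work_per_thread <= column_len)
        bs *= 2;
    return bs;
}

// usage with the example above: dim3 blocksize(pickBlockSize(m));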
Answer 1 (score 2):
As an alternative to the answer already provided by talonmies, I report here 4 other approaches for column reduction, 3 of them based on CUDA Thrust and 1 based on cublas<t>gemv() with a column of 1's, as suggested in my comment above.
The CUDA Thrust approaches are analogous to Reduce matrix rows with CUDA, with an implicit transposition obtained by
thrust::make_permutation_iterator(d_matrix.begin(),
    thrust::make_transform_iterator(thrust::make_counting_iterator(0),
        (_1 % Nrows) * Ncols + _1 / Nrows))
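In other words, element k of the permuted sequence is d_matrix[(k % Nrows) * Ncols + k / Nrows], so the row-major matrix is traversed column by column. A tiny host-side check of that index map (hypothetical, not part of the approaches below):
#include <cstdio>

int main() {
    const int Nrows = 3, Ncols = 4;                    // small example sizes
    for (int k = 0; k < Nrows * Ncols; k++)            // k indexes the permuted (transposed) view
        printf("permuted[%2d] -> (row %d, col %d) -> linear index %2d\n",
               k, k % Nrows, k / Nrows, (k % Nrows) * Ncols + k / Nrows);
    return 0;
}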
Here is the full code:
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <stdio.h>
#include <iostream>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
using namespace thrust::placeholders;
// --- Required for approach #2
__device__ float *vals;
/***********************************************************************/
/* CONVERT LINEAR INDEX TO COLUMN INDEX - NEEDED FOR APPROACHES #1, #3 */
/***********************************************************************/
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {

    T Ncols; // --- Number of columns

    __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

    __host__ __device__ T operator()(T i) { return i / Ncols; }
};

/*********************************************/
/* COLUMN REDUCTION - NEEDED FOR APPROACH #2 */
/*********************************************/
struct col_reduction {

    const int Nrows;    // --- Number of rows
    const int Ncols;    // --- Number of cols

    col_reduction(int _Nrows, int _Ncols) : Nrows(_Nrows), Ncols(_Ncols) {}

    __device__ float operator()(float& x, int& y ) {
        float temp = 0.f;
        for (int i = 0; i<Nrows; i++) {
            temp += vals[y + (i*Ncols)];
        }
        return temp;
    }
};

/**************************/
/* NEEDED FOR APPROACH #3 */
/**************************/
template<typename T>
struct MulC: public thrust::unary_function<T, T>
{
    T C;

    __host__ __device__ MulC(T c) : C(c) { }

    __host__ __device__ T operator()(T x) { return x * C; }
};
/********/
/* MAIN */
/********/
int main()
{
    const int Nrows = 5;     // --- Number of rows
    const int Ncols = 8;     // --- Number of columns

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_matrix(Nrows * Ncols);
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);

    TimingGPU timerGPU;

    /***************/
    /* APPROACH #1 */
    /***************/
    timerGPU.StartCounter();
    // --- Allocate space for column sums and indices
    thrust::device_vector<float> d_col_sums(Ncols);
    thrust::device_vector<int>   d_col_indices(Ncols);

    // --- Compute column sums by summing values with equal column indices
    thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)),
                          thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                          thrust::make_permutation_iterator(
                              d_matrix.begin(),
                              thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                          d_col_indices.begin(),
                          d_col_sums.begin(),
                          thrust::equal_to<int>(),
                          thrust::plus<float>());

    //thrust::reduce_by_key(
    //              thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
    //              thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
    //              thrust::make_permutation_iterator(
    //                  d_matrix.begin(),
    //                  thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
    //              thrust::make_discard_iterator(),
    //              d_col_sums.begin());

    printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());

    // --- Print result
    for(int j = 0; j < Ncols; j++) {
        std::cout << "[ ";
        for(int i = 0; i < Nrows; i++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "] = " << d_col_sums[j] << "\n";
    }
    /***************/
    /* APPROACH #2 */
    /***************/
    timerGPU.StartCounter();
    thrust::device_vector<float> d_col_sums_2(Ncols, 0);
    float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]);
    gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
    thrust::transform(d_col_sums_2.begin(), d_col_sums_2.end(), thrust::counting_iterator<int>(0), d_col_sums_2.begin(), col_reduction(Nrows, Ncols));
    printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());

    for(int j = 0; j < Ncols; j++) {
        std::cout << "[ ";
        for(int i = 0; i < Nrows; i++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "] = " << d_col_sums_2[j] << "\n";
    }
    /***************/
    /* APPROACH #3 */
    /***************/
    timerGPU.StartCounter();
    thrust::device_vector<float> d_col_sums_3(Ncols, 0);
    thrust::device_vector<float> d_temp(Nrows * Ncols);
    thrust::inclusive_scan_by_key(
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                thrust::make_permutation_iterator(
                    d_matrix.begin(),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                d_temp.begin());
    thrust::copy(
                thrust::make_permutation_iterator(
                    d_temp.begin() + Nrows - 1,
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))),
                thrust::make_permutation_iterator(
                    d_temp.begin() + Nrows - 1,
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))) + Ncols,
                d_col_sums_3.begin());
    printf("Timing for approach #3 = %f\n", timerGPU.GetCounter());

    for(int j = 0; j < Ncols; j++) {
        std::cout << "[ ";
        for(int i = 0; i < Nrows; i++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "] = " << d_col_sums_3[j] << "\n";
    }
    /***************/
    /* APPROACH #4 */
    /***************/
    cublasHandle_t handle;

    timerGPU.StartCounter();
    cublasSafeCall(cublasCreate(&handle));

    thrust::device_vector<float> d_col_sums_4(Ncols);
    thrust::device_vector<float> d_ones(Nrows, 1.f);

    float alpha = 1.f;
    float beta  = 0.f;
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols,
                               thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_col_sums_4.data()), 1));
    printf("Timing for approach #4 = %f\n", timerGPU.GetCounter());

    for(int j = 0; j < Ncols; j++) {
        std::cout << "[ ";
        for(int i = 0; i < Nrows; i++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "] = " << d_col_sums_4[j] << "\n";
    }

    return 0;
}
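If you actually need the column means rather than the sums, as in the original question, any of the result vectors can be scaled in place, e.g. by reusing the MulC functor defined above and adding this line right after the cublasSgemv() call (just a sketch):
    // sketch: turn the column sums into column means by scaling with 1/Nrows
    thrust::transform(d_col_sums_4.begin(), d_col_sums_4.end(), d_col_sums_4.begin(), MulC<float>(1.f / (float)Nrows));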