I recently watched the CppCon talk about compiling CUDA code with Clang, where the speaker, after discussing the architecture a bit, implemented a sum reduction. I was interested in the approach he took, which was a reduction via shfl of the elements in a block, so since there was no working example I took his code, modified it a little, and turned it into a max reduction.
The thing is that this max reduction is very slow: compared to a CPU implementation that finds the maximum of 2^22 elements, I get ~90 ms on the GPU versus ~20 ms on the CPU. Here is the code for the shfl reduction:
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

using namespace std;

// Global max reduce test via warp shuffles
__global__ void d_max_reduce(const int *in, int *out, size_t N) {
    int sum = 0;  // running max for this thread
    // Each thread reduces 4 consecutive elements, read through the read-only cache
    size_t start = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
    for (size_t i = start; i < start + 4 && i < N; i++) {
        sum = max(__ldg(in + i), sum);
    }
    // Tree reduction across the 32 lanes of the warp via shuffle-down
    for (int i = 16; i; i >>= 1) {
        sum = max(__shfl_down(sum, i), sum);
    }
    // One atomic per warp into shared memory, then one per block into global memory
    __shared__ int shared_max;
    shared_max = 0;  // every thread writes the same value, then we sync
    __syncthreads();
    if (!(threadIdx.x % 32)) {
        atomicMax(&shared_max, sum);
    }
    __syncthreads();
    if (!threadIdx.x) {
        atomicMax(out, shared_max);
    }
}

int test_max_reduce(std::vector<int> &v) {
    int *in, *out;
    cudaMalloc(&in, v.size() * sizeof(int));
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(int));

    int threads = 256;
    d_max_reduce<<<ceil((float)v.size() / (threads * 4)), threads>>>(in, out, v.size());

    int res;
    cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(in);
    cudaFree(out);
    return res;
}
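(Side note: on CUDA 9 and newer, `__shfl_down` is deprecated in favor of the `_sync` intrinsics, so there the warp loop would presumably look like the sketch below, assuming all 32 lanes are active so the full mask is safe. I am on an older toolkit, so the timings below use the original intrinsic as shown above.

// Sketch only: CUDA 9+ replacement for the deprecated __shfl_down,
// assuming every lane of the warp participates (full 0xffffffff mask).
for (int i = 16; i; i >>= 1) {
    sum = max(__shfl_down_sync(0xffffffffu, sum, i), sum);
}
)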
So I then took one of Nvidia's examples of a strided reduction (also a sum), changed it to a max, and got times of about 7 ms. Here is the code for the strided reduction:
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

// Strided max reduction in shared memory (adapted from Nvidia's sum example)
__global__ void d_max_reduction(const int *in, int *out, size_t N) {
    extern __shared__ int s_data[];

    size_t tid = threadIdx.x;
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;

    // Stage one element per thread into shared memory, padding the tail with 0
    if (i < N)
        s_data[tid] = in[i];
    else
        s_data[tid] = 0;
    __syncthreads();

    // Tree reduction: halve the active stride each iteration
    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            s_data[tid] = max(s_data[tid], s_data[tid + s]);
        __syncthreads();
    }

    // One atomic per block into the global result
    if (!tid)
        atomicMax(out, s_data[0]);
}

int test_max_reduction(std::vector<int> &v) {
    int *in;
    int *out;
    cudaMalloc(&in, v.size() * sizeof(int));
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(int));

    int threads = 128;
    d_max_reduction<<<ceil((float)v.size() / threads),
                      threads,
                      threads * sizeof(int)>>>(in, out, v.size());

    int res;
    cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(in);
    cudaFree(out);
    return res;
}
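(Neither version checks CUDA errors, so a failed launch would just leave the result at 0. If you want to reproduce this, it may be worth wrapping the calls first; a minimal sketch with a hypothetical CUDA_CHECK helper, needing <cstdio> and <cstdlib>:

// Hypothetical helper, just for reproducing: abort on any failed CUDA call
// instead of silently reading back a stale 0 from *out.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,     \
                         __LINE__, cudaGetErrorString(err_));            \
            std::exit(EXIT_FAILURE);                                     \
        }                                                                \
    } while (0)

// e.g. CUDA_CHECK(cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost));
)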
And just in case, here is the rest of the code so there is an MWE:
#include <algorithm>
#include <random>
#include <vector>

#include "timer.hpp"

int test_max_reduce(std::vector<int> &v);    // shfl version
int test_max_reduction(std::vector<int> &v); // strided version

int main() {
    int N = 2000 * 2000; // * 2000;
    std::vector<int> vec(N);

    std::random_device dev;
    std::mt19937 mt(dev());
    std::uniform_int_distribution<int> dist(0, N << 2);
    for (size_t i = 0; i < vec.size(); i++) {
        vec[i] = dist(mt);
    }

    measure("GPU (shfl)", test_max_reduce, vec);
    measure("GPU strided", test_max_reduction, vec);
    measure("CPU",
            [](std::vector<int> &vec) -> int {
                int maximum = 0;
                for (size_t i = 0; i < vec.size(); i++) {
                    maximum = std::max(maximum, vec[i]);
                }
                return maximum;
            },
            vec);
    return 0;
}
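Note that measure (defined in timer.hpp below) times the whole test function on the host, including cudaMalloc and both copies. To time just the kernel I would presumably use CUDA events instead; a sketch, with blocks/threads as in the launches above (this is not how the numbers below were produced):

// Sketch: kernel-only timing with CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
d_max_reduce<<<blocks, threads>>>(in, out, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);   // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);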
And timer.hpp is:
#ifndef TIMER_HPP
#define TIMER_HPP

#include <chrono>
#include <iostream>
#include <string>
#include <utility>

// Runs func(args...), prints its result and the wall-clock time in milliseconds
template <typename F, typename ...Args>
void measure(std::string msg, F func, Args&&... args) {
    auto start = std::chrono::steady_clock::now();
    int val = func(std::forward<Args>(args)...);
    auto end = std::chrono::steady_clock::now();
    std::cout << msg << " Test " << std::endl;
    std::cout << " Max Value : " << val << std::endl;
    std::cout << " Time : ";
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>
                     (end - start).count() << std::endl;
}

#endif // TIMER_HPP
I usually get the following times:
GPU (shfl) Test
Max Value : 15999999
Time : 86
GPU strided Test
Max Value : 15999999
Time : 7
CPU Test
Max Value : 15999999
Time : 23
EDIT: new times after adding a warm-up run:
GPU (shfl) Test
Max Value : 16000000
Time : 4
GPU strided Test
Max Value : 16000000
Time : 6
CPU Test
Max Value : 16000000
Time : 23
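(A sketch of what the warm-up amounts to, assuming the simplest approach of one untimed throwaway pass, optionally with cudaFree(0) to force context creation, since the first CUDA call in a process pays that cost:

cudaFree(0);              // common idiom to force context initialization
test_max_reduce(vec);     // untimed throwaway runs
test_max_reduction(vec);

measure("GPU (shfl)", test_max_reduce, vec);
measure("GPU strided", test_max_reduction, vec);
)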
So my more general question is: why is the shfl version slower than the strided one? Which can be split into:
Am I doing or assuming something wrong in the launch parameters?
When is it recommended to use the shfl intrinsics over a strided loop, and vice versa?