I recently watched the CppCon talk about compiling CUDA code with Clang, where the speaker, after discussing the architecture a bit, implemented a sum reduction. I was interested in the approach he took, which was a reduction via shfl of the elements in a block, so since there was no working example I took his code, modified it a little, and turned it into a max reduction.
The thing is that this max reduction is very slow: compared to a CPU implementation that finds the maximum of 2^22 elements, I get ~90 ms on the GPU versus ~20 ms on the CPU. Here is the code for the shfl reduction:
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

using namespace std;

// Global max reduce test via warp shuffles
__global__ void d_max_reduce(const int *in, int *out, size_t N) {
    int sum = 0;  // running max for this thread
    // Each thread reduces 4 consecutive elements, read through the read-only cache
    size_t start = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
    for (size_t i = start; i < start + 4 && i < N; i++) {
        sum = max(__ldg(in + i), sum);
    }
    // Tree reduction across the 32 lanes of the warp via shuffle-down
    for (int i = 16; i; i >>= 1) {
        sum = max(__shfl_down(sum, i), sum);
    }
    // One atomic per warp into shared memory, then one per block into global memory
    __shared__ int shared_max;
    shared_max = 0;  // every thread writes the same value, then we sync
    __syncthreads();
    if (!(threadIdx.x % 32)) {
        atomicMax(&shared_max, sum);
    }
    __syncthreads();
    if (!threadIdx.x) {
        atomicMax(out, shared_max);
    }
}

int test_max_reduce(std::vector<int> &v) {
    int *in, *out;
    cudaMalloc(&in, v.size() * sizeof(int));
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(int));

    int threads = 256;
    d_max_reduce<<<ceil((float)v.size() / (threads * 4)), threads>>>(in, out, v.size());

    int res;
    cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(in);
    cudaFree(out);
    return res;
}
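(Side note: on CUDA 9 and newer, `__shfl_down` is deprecated in favor of the `_sync` intrinsics, so there the warp loop would presumably look like the sketch below, assuming all 32 lanes are active so the full mask is safe. I am on an older toolkit, so the timings below use the original intrinsic as shown above.

// Sketch only: CUDA 9+ replacement for the deprecated __shfl_down,
// assuming every lane of the warp participates (full 0xffffffff mask).
for (int i = 16; i; i >>= 1) {
    sum = max(__shfl_down_sync(0xffffffffu, sum, i), sum);
}
)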
So I then took one of Nvidia's examples of a strided reduction (also a sum), changed it to a max, and got times of about 7 ms. Here is the code for the strided reduction:
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

// Strided max reduction in shared memory (adapted from Nvidia's sum example)
__global__ void d_max_reduction(const int *in, int *out, size_t N) {
    extern __shared__ int s_data[];

    size_t tid = threadIdx.x;
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;

    // Stage one element per thread into shared memory, padding the tail with 0
    if (i < N)
        s_data[tid] = in[i];
    else
        s_data[tid] = 0;
    __syncthreads();

    // Tree reduction: halve the active stride each iteration
    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            s_data[tid] = max(s_data[tid], s_data[tid + s]);
        __syncthreads();
    }

    // One atomic per block into the global result
    if (!tid)
        atomicMax(out, s_data[0]);
}

int test_max_reduction(std::vector<int> &v) {
    int *in;
    int *out;
    cudaMalloc(&in, v.size() * sizeof(int));
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, v.data(), v.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(int));

    int threads = 128;
    d_max_reduction<<<ceil((float)v.size() / threads),
                      threads,
                      threads * sizeof(int)>>>(in, out, v.size());

    int res;
    cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(in);
    cudaFree(out);
    return res;
}
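(Neither version checks CUDA errors, so a failed launch would just leave the result at 0. If you want to reproduce this, it may be worth wrapping the calls first; a minimal sketch with a hypothetical CUDA_CHECK helper, needing <cstdio> and <cstdlib>:

// Hypothetical helper, just for reproducing: abort on any failed CUDA call
// instead of silently reading back a stale 0 from *out.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,     \
                         __LINE__, cudaGetErrorString(err_));            \
            std::exit(EXIT_FAILURE);                                     \
        }                                                                \
    } while (0)

// e.g. CUDA_CHECK(cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost));
)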
And just in case, here is the rest of the code so there is an MWE:
#include <algorithm>
#include <random>
#include <vector>

#include "timer.hpp"

int test_max_reduce(std::vector<int> &v);    // shfl version
int test_max_reduction(std::vector<int> &v); // strided version

int main() {
    int N = 2000 * 2000; // * 2000;
    std::vector<int> vec(N);

    std::random_device dev;
    std::mt19937 mt(dev());
    std::uniform_int_distribution<int> dist(0, N << 2);
    for (size_t i = 0; i < vec.size(); i++) {
        vec[i] = dist(mt);
    }

    measure("GPU (shfl)", test_max_reduce, vec);
    measure("GPU strided", test_max_reduction, vec);
    measure("CPU",
            [](std::vector<int> &vec) -> int {
                int maximum = 0;
                for (size_t i = 0; i < vec.size(); i++) {
                    maximum = std::max(maximum, vec[i]);
                }
                return maximum;
            },
            vec);
    return 0;
}
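Note that measure (defined in timer.hpp below) times the whole test function on the host, including cudaMalloc and both copies. To time just the kernel I would presumably use CUDA events instead; a sketch, with blocks/threads as in the launches above (this is not how the numbers below were produced):

// Sketch: kernel-only timing with CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
d_max_reduce<<<blocks, threads>>>(in, out, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);   // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);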
And timer.hpp is:
#ifndef TIMER_HPP
#define TIMER_HPP

#include <chrono>
#include <iostream>
#include <string>
#include <utility>

// Runs func(args...), prints its result and the wall-clock time in milliseconds
template <typename F, typename ...Args>
void measure(std::string msg, F func, Args&&... args) {
    auto start = std::chrono::steady_clock::now();
    int val = func(std::forward<Args>(args)...);
    auto end = std::chrono::steady_clock::now();
    std::cout << msg << " Test " << std::endl;
    std::cout << " Max Value : " << val << std::endl;
    std::cout << " Time : ";
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>
                     (end - start).count() << std::endl;
}

#endif // TIMER_HPP
I usually get the following times:
GPU (shfl) Test
Max Value : 15999999
Time : 86
GPU strided Test
Max Value : 15999999
Time : 7
CPU Test
Max Value : 15999999
Time : 23
EDIT: new times after adding a warm-up run:
GPU (shfl) Test
Max Value : 16000000
Time : 4
GPU strided Test
Max Value : 16000000
Time : 6
CPU Test
Max Value : 16000000
Time : 23
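(A sketch of what the warm-up amounts to, assuming the simplest approach of one untimed throwaway pass, optionally with cudaFree(0) to force context creation, since the first CUDA call in a process pays that cost:

cudaFree(0);              // common idiom to force context initialization
test_max_reduce(vec);     // untimed throwaway runs
test_max_reduction(vec);

measure("GPU (shfl)", test_max_reduce, vec);
measure("GPU strided", test_max_reduction, vec);
)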
So my more general question is: why is the shfl version slower than the strided one? Which can be split into:
Am I doing or assuming something wrong in the launch parameters?
When is it recommended to use the shfl intrinsics over a strided loop, and vice versa?