Question

我用 C++ 和 CUDA 编写的 TensorFlow r1.5 Op 中，有一部分需要对 Tensor 做归约（reduction）。我按照此处所述实现了简单的交错寻址（interleaved addressing）归约算法。但是，它似乎并没有对整个缓冲区进行归约。

块级归约（block reduce）的实现如下：

/**
 * Block-level sum reduction: every block writes the sum of its slice of
 * `buffer` to out[blockIdx.x].
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread contributes at most one
 * element, buffer[blockIdx.x * blockDim.x + threadIdx.x]; indices >= len
 * contribute zero. Elements at indices >= gridDim.x * blockDim.x are never
 * read, so the launch has to supply at least `len` threads in total for the
 * whole buffer to be covered.
 *
 * Shared memory: blockDim.x * sizeof(T) bytes of dynamic shared memory.
 * blockDim.x may be any positive value (it need not be a power of two).
 *
 * @param buffer Device pointer to `len` input values.
 * @param out    Device pointer to at least gridDim.x values; receives one
 *               partial sum per block.
 * @param len    Number of elements in `buffer`.
 */
template<typename T>
__global__
void blockReduceDevice(const T *buffer, T *out, size_t len) {
    const size_t tIdx = threadIdx.x;
    const size_t bIdx = blockIdx.x;
    const size_t bDim = blockDim.x;
    const size_t idx = bIdx * bDim + tIdx;

    //To allow templated, dynamic shared memory, we set the
    //smem to be uchar and reinterpret as templated type.
    //No barrier is needed before the first write to shared memory.
    extern __shared__ __align__(sizeof(T)) unsigned char buffReduce[];
    T *reduce = reinterpret_cast<T*>(buffReduce);

    //Set contribution of this thread. Zero if out of bounds.
    //T(0) avoids the double literal 0.0 in a templated kernel.
    reduce[tIdx] = (idx < len) ? buffer[idx] : T(0);
    __syncthreads();

    //Block reduce. `active` is the number of live partial sums; each pass
    //folds the upper part onto the lower part. Rounding the split point up
    //keeps the middle element when `active` is odd, so no value is dropped
    //for block sizes that are not a power of two. For power-of-two sizes
    //this is the same sequence of additions as halving blockDim.x.
    //The loop bounds depend only on blockDim.x, so every thread of the
    //block reaches each __syncthreads().
    for (size_t active = bDim; active > 1; ) {
        const size_t half = (active + 1) >> 1;
        if (tIdx + half < active) {
            reduce[tIdx] += reduce[tIdx + half];
        }
        __syncthreads();
        active = half;
    }

    //Thread 0 holds the block total.
    if (tIdx == 0) {
        out[bIdx] = reduce[0];
    }
}

上面的内核按如下方式调用：

/**
 * Diagnostic harness: fills `buffer` with ones, sums it once on the GPU
 * (blockReduceDevice followed by a host-side sum of the per-block results)
 * and once on the CPU, then prints both results next to the launch
 * configuration that was used.
 *
 * Note that the function OVERWRITES the device memory behind `buffer` even
 * though the parameter is declared const; the signature is kept as-is for
 * existing callers.
 *
 * NOTE(review): the launch configuration can contain fewer than `len`
 * threads in total (the sample run in the question shows 10 x 1024 threads
 * for len = 32768), so the two printed results only agree when the kernel
 * covers all `len` elements with the grid it is given.
 *
 * @param buffer    Device pointer to `len` values; overwritten with ones.
 * @param blockVals Device pointer with room for one value per launched block.
 * @param dev       GPU device providing the stream and the launch configuration.
 * @param len       Number of elements in `buffer`.
 */
template<typename T>
void testReduce(const T *buffer, T *blockVals, const GPUDevice &dev, size_t len) {
    //Get CUDA stream.
    const cudaStream_t &stream = dev.stream();

    //Get launch configuration for reduction operation.
    const auto reduceConfig = tensorflow::GetCudaLaunchConfig(len, dev);
    const size_t blocks = reduceConfig.block_count;
    const size_t threads = reduceConfig.thread_per_block;
    //One shared-memory slot per thread of a block.
    const size_t shared = threads * sizeof(T);

    //Reset buffer to known value. The destination of a copy must be a
    //non-const pointer, hence the const_cast on the device pointer.
    //The host vector stays alive until the stream has been synchronized.
    std::vector<T> knownValsHost(len, static_cast<T>(1));
    CUSAFE(cudaMemcpyAsync(const_cast<T*>(buffer), knownValsHost.data(), len * sizeof(T),
                           cudaMemcpyHostToDevice, stream));
    CUSAFE(cudaStreamSynchronize(stream));

    //Reset output to nought.
    std::vector<T> tmp(blocks, static_cast<T>(0));
    CUSAFE(cudaMemcpyAsync(blockVals, tmp.data(), blocks * sizeof(T),
                           cudaMemcpyHostToDevice, stream));
    CUSAFE(cudaStreamSynchronize(stream));

    //Reduce on the GPU. Launch errors surface through cudaPeekAtLastError,
    //execution errors through the stream synchronization that follows.
    blockReduceDevice<T><<<blocks, threads, shared, stream>>>(buffer, blockVals, len);
    CUSAFE(cudaPeekAtLastError());
    CUSAFE(cudaStreamSynchronize(stream));

    //Further reduce on the CPU: sum the per-block partial results.
    std::vector<T> blockValsHost(blocks, static_cast<T>(0));
    CUSAFE(cudaMemcpyAsync(blockValsHost.data(), blockVals, blocks * sizeof(T),
                           cudaMemcpyDeviceToHost, stream));
    CUSAFE(cudaStreamSynchronize(stream));
    const T resGPU = std::accumulate(blockValsHost.begin(), blockValsHost.end(), static_cast<T>(0));

    //Get result when copying buffer to CPU memory and reducing.
    std::vector<T> bufferHost(len, static_cast<T>(0));
    CUSAFE(cudaMemcpyAsync(bufferHost.data(), buffer, len * sizeof(T),
                           cudaMemcpyDeviceToHost, stream));
    CUSAFE(cudaStreamSynchronize(stream));
    const T resCPU = std::accumulate(bufferHost.begin(), bufferHost.end(), static_cast<T>(0));

    //Print some output for diagnostics.
    std::cout << "Length: " << len << std::endl;
    std::cout << "Num CUDA Blocks: " << blocks << std::endl;
    std::cout << "Num CUDA Threads Per Block: " << threads << std::endl;
    std::cout << "GPU Result: " << resGPU << std::endl;
    std::cout << "CPU Result: " << resCPU << std::endl;
}

在上面的测试用例中，给出了以下输出，其中所有缓冲区条目都设置为1.0

Length: 32768
Num CUDA Blocks: 10
Num CUDA Threads Per Block: 1024
GPU Result: 10240
CPU Result: 32768

可以看出，使用 std::accumulate 在 CPU 上做的归约按预期工作（即 len == resCPU）。这使我相信 CUDA 内核没有完整执行，因为 blocks * threads != len。

TensorFlow 文档（见此处）指出，应当使用 tensorflow/core/util/cuda_kernel_helper.h 头文件（可在此处找到）来获取 CUDA 内核的启动配置。

为什么TensorFlow会为我提供一个不执行适当线程数的启动配置？

我手动设置启动配置参数时也会收到类似的结果。

Answer 1

为什么 TensorFlow 会为我提供一个不执行适当线程数的启动配置？

我猜这是因为 TensorFlow 期望它运行的内核遵循某种设计原则，而你的内核并不符合。TensorFlow 返回的执行参数会把线程总数限制为给定设备上理论上可以同时运行的最大并发线程数。详情请参阅此处。

你需要做的是编写一个符合该设计模式的内核，也就是让每个线程能够处理多个输入数据点。在实践中，这意味着将内核简单地修改为：

/**
 * Block-level sum reduction that works for any grid size: every block writes
 * the sum of the elements it visited to out[blockIdx.x].
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at
 * blockIdx.x * blockDim.x + threadIdx.x and advances by
 * gridDim.x * blockDim.x until it passes `len`, so all `len` elements are
 * visited regardless of how many blocks are launched.
 *
 * Shared memory: blockDim.x * sizeof(T) bytes of dynamic shared memory.
 * blockDim.x may be any positive value (it need not be a power of two).
 *
 * @param buffer Device pointer to `len` input values.
 * @param out    Device pointer to at least gridDim.x values; receives one
 *               partial sum per block.
 * @param len    Number of elements in `buffer`.
 */
template<typename T>
__global__
void blockReduceDevice(const T *buffer, T *out, size_t len) {
    const size_t tIdx = threadIdx.x;
    const size_t bIdx = blockIdx.x;
    const size_t bDim = blockDim.x;
    //Total number of threads in the grid; widened before multiplying.
    const size_t stride = static_cast<size_t>(gridDim.x) * bDim;

    //To allow templated, dynamic shared memory, we set the
    //smem to be uchar and reinterpret as templated type.
    //No barrier is needed here: shared memory has not been touched yet.
    extern __shared__ __align__(sizeof(T)) unsigned char buffReduce[];
    T *reduce = reinterpret_cast<T*>(buffReduce);

    //Accumulate this thread's share of the input in a register.
    //Threads whose first index is already out of bounds contribute zero.
    //Neighbouring threads read neighbouring elements on every pass.
    T threadsum = T(0);
    for (size_t idx = bIdx * bDim + tIdx; idx < len; idx += stride) {
        threadsum += buffer[idx];
    }

    // store thread local partial reduction to shared memory
    reduce[tIdx] = threadsum;
    __syncthreads();

    //Block reduce. `active` is the number of live partial sums; each pass
    //folds the upper part onto the lower part. Rounding the split point up
    //keeps the middle element when `active` is odd, so block sizes that are
    //not a power of two are handled as well. The loop bounds depend only on
    //blockDim.x, so every thread of the block reaches each __syncthreads().
    for (size_t active = bDim; active > 1; ) {
        const size_t half = (active + 1) >> 1;
        if (tIdx + half < active) {
            reduce[tIdx] += reduce[tIdx + half];
        }
        __syncthreads();
        active = half;
    }

    //Thread 0 holds the block total.
    if (tIdx == 0) {
        out[bIdx] = reduce[0];
    }
}

[警告：显然从未编译或运行，使用风险自负]

基本上，这种设计让每个线程按需迭代处理多个输入数据点，从而在保证合并内存访问（memory coalescing）的同时处理完所有输入数据。

TensorFlow CUDA 归约（reduction）Op 没有完全归约

1 个答案: