我用C ++和CUDA编写的TensorFlow r1.5操作的一部分涉及减少Tensor。我已经实现了简单的交织缩减算法,如here所述。但是,似乎并没有减少整个缓冲区。
template<typename T>
void blockReduceDevice(const T *buffer, T *out, size_t len) {
const size_t tIdx = threadIdx.x;
const size_t bIdx = blockIdx.x;
const size_t bDim = blockDim.x;
const size_t idx = bIdx * bDim + tIdx;
//To allow templated, dynamic shared memory, we set the
//smem to be uchar and reinterpret as templated type.
extern __shared__ __align__(sizeof(T)) unsigned char buffReduce[];
//Set contribution of this thread. 0 if out of bounds.
T *reduce = reinterpret_cast<T*>(buffReduce);
reduce[tIdx] = (idx >= len) ? 0.0 : buffer[idx];
//Block reduce.
#pragma unroll
for (int i = bDim >> 1; i >= 1; i >>= 1) {
if(tIdx < i) {
reduce[tIdx] += reduce[tIdx + i];
if(tIdx == 0) {
out[bIdx] = reduce[tIdx];
template<typename T>
void testReduce(const T *buffer, T *blockVals, const GPUDevice &dev, size_t len) {
//Get CUDA stream.
const cudaStream_t &stream = dev.stream();
//Get launch configuration for reduction operation.
const auto reduceConfig = tensorflow::GetCudaLaunchConfig(len, dev);
const size_t blocks = reduceConfig.block_count;
const size_t threads = reduceConfig.thread_per_block;
const size_t shared = threads * sizeof(T);
//Reset buffer to known value.
std::vector<T> knownValsHost(len, 1.0);
cudaMemcpyAsync(buffer, &knownValsHost[0], len * sizeof(T), cudaMemcpyHostToDevice, stream);
//Reset output to nought.
std::vector<T> tmp(blocks, 0.0);
cudaMemcpyAsync(blockVals, &tmp[0], blocks * sizeof(T), cudaMemcpyHostToDevice, stream);
//Reduce on the GPU.
blockReduceDevice<T><<<blocks, threads, shared, stream>>>(buffer, blockVals, len);
//Further reduce on the CPU.
std::vector<T> blockValsHost(blocks, 0.0);
cudaMemcpyAsync(&blockValsHost[0], blockVals, blocks * sizeof(T), cudaMemcpyDeviceToHost, stream);
const T resGPU = std::accumulate(blockValsHost.begin(), blockValsHost.end(), static_cast<T>(0));
//Get result when copying buffer to CPU memory and reducing.
std::vector<T> bufferHost(len, 0.0);
cudaMemcpyAsync(&bufferHost[0], buffer, len * sizeof(T), cudaMemcpyDeviceToHost, stream);
const T resCPU = std::accumulate(bufferHost.begin(), bufferHost.end(), static_cast<T>(0));
//Print some output for diagnostics.
std::cout << "Length: " << len << std::endl;
std::cout << "Num CUDA Blocks: " << blocks << std::endl;
std::cout << "Num CUDA Threads Per Block: " << threads << std::endl;
std::cout << "GPU Result: " << resGPU << std::endl;
std::cout << "CPU Result: " << resCPU << std::endl;
Length: 32768
Num CUDA Blocks: 10
Num CUDA Threads Per Block: 1024
GPU Result: 10240
CPU Result: 32768
的CPU减少按预期工作(如len == resCPU
)。这使我相信CUDA内核没有完全执行为blocks * threads != len
答案 0 :(得分:2)
为什么TensorFlow会为我提供发布 没有执行适当线程数的配置?
template<typename T>
void blockReduceDevice(const T *buffer, T *out, size_t len) {
const size_t tIdx = threadIdx.x;
const size_t bIdx = blockIdx.x;
const size_t bDim = blockDim.x;
const size_t idx = bIdx * bDim + tIdx;
const size_t stride = gridDim.x * blockDim.x
//To allow templated, dynamic shared memory, we set the
//smem to be uchar and reinterpret as templated type.
extern __shared__ __align__(sizeof(T)) unsigned char buffReduce[];
// cargo cult : __syncthreads();
//Set contribution of this thread. 0 if out of bounds.
T *reduce = reinterpret_cast<T*>(buffReduce);
T threadsum = T(0);
for(; idx < len; idx += stride)
threadsum += buffer[idx];
// store thread local partial reduction to shared memory
reduce[tIdx] = threadsum;
// etc