Suppose I have a CUDA GPU kernel for a custom tensor operation that uses constant memory:
__constant__ int cdata[100];

__global__ void frobulate(float * data)
{
    int i = blockDim.x*blockIdx.x + threadIdx.x;
    float value = data[i];

    // Accumulate the constant data into this thread's value
    for(int j=0; j < 100; ++j) {
        value += cdata[j];
    }

    // Write the result back out
    data[i] = value;
}
Then, in the Compute method of my custom Frobulate operation:
class Frobulate : public tensorflow::OpKernel
{
public:
    void Compute(OpKernelContext * context) override
    {
        ...
        // Get the current device
        const Eigen::GpuDevice & device =
            context->eigen_device<Eigen::GpuDevice>();

        // Local, mutating version of constant data.
        // For illustration purposes only
        int local_data[100];

        // Reason about our local shape
        TensorShape local_shape({100});

        // Create a pointer to hold allocated output
        Tensor * pinned_ary_ptr = nullptr;

        // Allocate memory for the constant data.
        // I don't think allocate_output is correct here...
        // but we need pinned host memory for an async transfer
        OP_REQUIRES_OK(context, context->allocate_output(
            0, local_shape, &pinned_ary_ptr));

        // Copy the host data into the (hopefully pinned) tensor
        for(int i=0; i<100; ++i)
            { pinned_ary_ptr->flat<int>()(i) = local_data[i]; }

        // Get the symbol address of cdata and enqueue an
        // async transfer on the device's stream
        int * d_cdata_ptr;
        cudaGetSymbolAddress((void **)&d_cdata_ptr, cdata);
        cudaMemcpyAsync(d_cdata_ptr, pinned_ary_ptr->flat<int>().data(),
            sizeof(int)*100, cudaMemcpyHostToDevice, device.stream());

        // Call the kernel (grid, blocks and data set up elsewhere)
        frobulate<<<grid, blocks, 0, device.stream()>>>(data);
    }
};
It would be nice to set cdata up as an Attr or an Input in REGISTER_OP, but I don't think that would hook up to the constant data correctly. I think the cudaGetSymbolAddress is necessary...
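For reference, a minimal sketch of what such a registration might look like; "Frobulate" is the name used above, but the attr/input signatures here are placeholders I've assumed, not a confirmed working setup:

// Hypothetical: expose the constant data as an attribute.
// This only delivers the values to the OpKernel's constructor;
// it would not, by itself, populate the __constant__ array, so
// the explicit cudaGetSymbolAddress/cudaMemcpyAsync step remains.
REGISTER_OP("Frobulate")
    .Attr("cdata: list(int)")
    .Input("data: float")
    .Output("output: float");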
EDIT 1: Would this allocate pinned memory? (Memory usually allocated with cudaHostAlloc, whose pages are pinned for DMA transfers to the GPU, i.e. the OS prevents them from being paged out.)
tensorflow::AllocatorAttributes pinned_allocator;
pinned_allocator.set_on_host(true);
pinned_allocator.set_gpu_compatible(true);

// Allocate memory for the constant data
OP_REQUIRES_OK(context, context->allocate_temp(
    DT_UINT8, cdata_shape, &cdata_tensor,
    pinned_allocator));
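One way to check at runtime whether that buffer really is pinned is to ask CUDA about the pointer. A small sketch, assuming cdata_tensor was allocated as above (memoryType is the pre-CUDA-10 spelling of the field; newer toolkits call it type):

cudaPointerAttributes attrs;
cudaError_t err = cudaPointerGetAttributes(&attrs,
    cdata_tensor.flat<tensorflow::uint8>().data());
// Pinned (page-locked) host memory reports cudaMemoryTypeHost;
// ordinary pageable memory makes the call fail on older toolkits.
bool is_pinned = (err == cudaSuccess &&
                  attrs.memoryType == cudaMemoryTypeHost);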
Answer (score: 2)
Yes, cudaGetSymbolAddress is necessary. Constant memory is specific to the kernel, so it can't be wired up through TensorFlow's usual Input/Attr machinery. Just make sure the operations are issued on the stream in the right order and synchronized correctly.
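To make that ordering concrete, a sketch under the same assumptions as the question's code (d_cdata_ptr, pinned_ary_ptr, grid, blocks and data set up elsewhere). Because both the copy and the kernel launch are enqueued on device.stream(), the copy is guaranteed to finish before frobulate starts reading cdata:

// Both operations target the same stream, so they execute in order.
cudaMemcpyAsync(d_cdata_ptr, pinned_ary_ptr->flat<int>().data(),
    sizeof(int)*100, cudaMemcpyHostToDevice, device.stream());
frobulate<<<grid, blocks, 0, device.stream()>>>(data);

// Only needed if the host staging buffer will be reused or freed
// before the copy is known to have completed:
cudaStreamSynchronize(device.stream());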
Yes, that allocates pinned memory. An output is memory that the kernel writes as the result of the operation. Temp memory is for scratch that a single invocation of the kernel needs; some cudnn kernels, such as the convolutions, use it. See tensorflow/kernels/conv_ops.cc.
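In code, the distinction looks roughly like this (a sketch; output_shape and cdata_shape are assumed to be defined, and pinned_allocator is set up as in the EDIT above):

// Output: the op's result, handed to downstream ops.
Tensor * output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(
    0, output_shape, &output));

// Temp: scratch for this single invocation, e.g. a pinned host
// staging buffer for the constant-memory upload.
Tensor staging;
OP_REQUIRES_OK(context, context->allocate_temp(
    DT_UINT8, cdata_shape, &staging, pinned_allocator));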