// Constant-memory buffer; main() fills it with cudaMemcpyToSymbol on the
// branch taken when `size > MAXSIZE` is false.
__constant__ float constbuf[MAXSIZE];
// Device-side global pointer; main() stores the address of the cudaMalloc'ed
// buffer in it with cudaMemcpyToSymbol on the `size > MAXSIZE` branch.
__device__ float *d_buf;
// Kernel stub: takes the device buffer pointer selected by the host code.
// The body is only a placeholder in this snippet.
__global__ void simple(float *buf2)
{
    // access buf2
}
// Host entry point: stages h_buf either in a cudaMalloc'ed global-memory
// buffer (size > MAXSIZE) or in the __constant__ array constbuf (otherwise),
// then launches `simple` with the device address of whichever buffer was used.
int main(){
    int size, asize;
    // d_buf2 starts as a null pointer so the cudaFree below is a no-op when
    // the constant-memory branch is taken.
    float *abuf, *d_buf2 = 0, *h_buf;
    //... (elided in the original snippet; presumably sets size, asize, h_buf,
    //     grid, block and stream -- TODO confirm)
    if (size > MAXSIZE) {
        cudaMalloc(&d_buf2, asize);
        // cudaMemcpy needs the transfer direction as its fourth argument.
        cudaMemcpy(d_buf2, h_buf, asize, cudaMemcpyHostToDevice);
        // Keep the device-side global pointer d_buf pointing at the buffer.
        cudaMemcpyToSymbol(d_buf, &d_buf2, sizeof(d_buf2));
        // Pass the buffer itself to the kernel. cudaGetSymbolAddress on d_buf
        // yields the address of the pointer variable d_buf (a float**), not
        // the data it points to, which is why the kernel saw the wrong buffer.
        abuf = d_buf2;
    } else {
        cudaMemcpyToSymbol(constbuf, h_buf, asize);
        // constbuf is an array symbol, so its symbol address is the address
        // of the data and can be handed to the kernel directly.
        cudaGetSymbolAddress((void **)&abuf, constbuf);
    }
    simple<<<grid, block, 0, stream>>>(abuf);
    // Wait for the kernel before releasing the buffer it reads.
    cudaStreamSynchronize(stream);
    cudaFree(d_buf2);
    return 0;
}
我想实现上面代码所示的功能,但发现内核拿到的缓冲区不正确。有没有办法实现这个目标?如果可能的话,我不想在内核中添加“if”条件判断。
答案 0 :(得分:2)
对此最好的解决方案是让一个 __device__ 函数完成大部分工作,再用两个 __global__ 内核分别包装这个 __device__ 函数。
例如:
// Constant-memory buffer read by simple_const_mem. The element type is float
// to match the float* parameter of simple_core, which receives this array.
__constant__ float c_buf[MAXSIZE];
// Shared device-side implementation called by both kernel wrappers
// (simple_global_mem and simple_const_mem).
//   buf: pointer to the data supplied by the calling kernel
//   len: length value forwarded by the calling kernel
__device__ void simple_core(float *buf, int len)
{
    // do something here.
}
// Kernel wrapper for the case where the data is passed in as a pointer:
// forwards d_buf and len unchanged to simple_core.
// __global__ functions must be declared with a void return type.
__global__ void simple_global_mem(float *d_buf, int len)
{
    simple_core(d_buf, len);
}
// Kernel wrapper for the case where the data lives in the __constant__ array
// c_buf: forwards c_buf and len to simple_core, so no pointer argument is
// needed at launch. __global__ functions must be declared with a void return
// type.
__global__ void simple_const_mem(int len)
{
    simple_core(c_buf, len);
}
// Host-side dispatch: chooses the kernel wrapper based on len, so the kernels
// themselves need no branch on where the data is stored.
int main()
{
    // other code (elided in the original snippet; presumably declares len,
    // d_buf, blocks and threads -- TODO confirm)
    if (len < MAXSIZE) {
        // cuda memcpy to symbol code here
        // Execution configuration is <<<grid dimension, block dimension>>>,
        // i.e. the number of blocks comes first.
        simple_const_mem<<<blocks, threads>>>(len);
    }
    else {
        simple_global_mem<<<blocks, threads>>>(d_buf, len);
    }
}