我正在尝试使用cuPrintf函数,打印作为内核函数参数传入的string数组中的各个元素。
内核的代码
// Debug kernel: every thread prints the word-list entry selected by its
// global thread index through cuPrintf.
// NOTE(review): `string` objects are used directly in device code here; this
// is the construct the compiler error quoted below this snippet complains
// about (host-only std::basic_string destructor called from a __global__
// function).
__global__ void testKernel(string wordList[10000])
{
// Flat global index for a 1D grid of 1D blocks.
const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
cuPrintf("wordList[%d]: %s \n", index, wordList[index]);
}
主函数的代码,用于设置执行参数并启动内核
// Device-side buffer for the word list.
// NOTE(review): the copy below transfers the raw bytes of the host `string`
// objects; presumably any storage they own stays in host memory, so the
// device copy is not usable as text -- confirm before relying on it.
string* d_wordList;
const size_t list_bytes = sizeof(string)*number_of_words;
cudaMalloc((void**)&d_wordList, list_bytes);
// Host -> device transfer of the word list.
cudaMemcpy(d_wordList, wordList, list_bytes, cudaMemcpyHostToDevice);
// Launch configuration: 256 threads per block, with the block count rounded
// up so that there is at least one thread per word.
int threads_per_block = 256;
int n_blocks = (number_of_words + threads_per_block - 1)/threads_per_block;
dim3 threads(threads_per_block, 1, 1);
dim3 grid(n_blocks, 1, 1);
// cuPrintf life cycle: init, run the kernel, wait for it, dump the buffered
// output to stdout (with thread ids), then release the cuPrintf resources.
cudaPrintfInit();
testKernel<<<grid, threads>>>(d_wordList);
cudaDeviceSynchronize();
cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();
我收到如下错误:“Error 44 error: 不允许从 __global__ 函数("testKernel")调用主机函数("std::basic_string<..., std::allocator<...> >::~basic_string") D:\...\kernel.cu 44 1 CUDA_BF_large_word_list”
我错过了什么? 提前谢谢。
答案 0 :(得分:1)
通常,您不能在CUDA设备代码中使用C++标准库中的函数和类型(包括<string>)。
请改用char数组来保存字符串。
这里有一个示例,演示如何把"字符串"作为C风格的、以null结尾的char数组来处理,并将它们传递给内核。
答案 1 :(得分:0)
我修改了代码,改用字符(char)数组来保存字符串。
内核的更新版本是:
// Debug kernel: every thread prints the first 10 characters of "its" word
// through cuPrintf.
//
// Launch layout: 1D grid of 1D blocks; the thread with global index i reads
// record i of the word list.
//
// d_wordList  Device pointer to a flat buffer of fixed-size records, 20 bytes
//             per word (the stride is hard-coded below and must match the
//             host-side record length).
// num_words   Optional number of valid records. Threads whose global index is
//             >= num_words return without touching memory. The default (-1)
//             disables the check, which is the original behaviour: in that
//             case the buffer must hold a record for EVERY launched thread,
//             including the tail threads of the last block.
__global__ void testKernel(char* d_wordList, int num_words = -1)
{
// Flat global thread index.
const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
// The grid is rounded up to whole blocks, so the last block usually contains
// threads that have no word; skip them when the caller told us the count.
if (num_words >= 0 && index >= (unsigned int)num_words)
return;
// Start of this thread's 20-byte record.
const char* word = d_wordList + index * 20;
// Exactly 10 characters are printed, regardless of where the terminating
// NUL of the word is.
cuPrintf("!! %c%c%c%c%c%c%c%c%c%c \n" , word[0],
word[1],
word[2],
word[3],
word[4],
word[5],
word[6],
word[7],
word[8],
word[9]);
}
我也想知道是否有更简单的方法来打印char数组中的单词。(基本上,我需要先把这些单词打印出来,之后再让每个线程在内核中使用一个单词。)
主要功能的代码是:
// Host driver for the char-array version of testKernel.
// Reads `number_of_words` whitespace-separated words from `file` (both are
// defined outside this snippet) into fixed-size records, copies them to the
// device, copies them back as a sanity check, and finally launches the kernel
// that prints one word per thread through cuPrintf.

// Bytes per word record; must match the stride of 20 hard-coded in testKernel.
const int text_length = 20;

// Launch configuration: 256 threads per block, block count rounded up so that
// every word gets a thread. The grid therefore usually contains more threads
// than words.
int threads_per_block = 256;
int n_blocks = (number_of_words + threads_per_block - 1)/threads_per_block;
const size_t launched_threads = (size_t)n_blocks * threads_per_block;

// Bytes that actually hold words, and bytes the kernel may touch: testKernel
// is launched without a word count, so every launched thread reads a record.
const size_t word_bytes = (size_t)number_of_words * text_length;
const size_t padded_bytes = launched_threads * text_length;

// Value-initialised ("()") so that the bytes after each word's terminator are
// zero; the kernel prints 10 characters per record no matter how short the
// word is.
char (*wordList)[text_length] = new char[number_of_words][text_length]();
char *dev_wordList = 0;
for(int i=0; i<number_of_words; i++)
{
// width() caps the extraction at text_length-1 characters plus the NUL, so an
// over-long token cannot overflow its record (the rest of the token is read
// as the next word). The stream resets the width after each extraction.
file.width(text_length);
file>>wordList[i];
cout<<wordList[i]<<endl;
}

// Device buffer sized for all launched threads and zero-filled, so the tail
// threads of the last block read zeros instead of unallocated memory.
// Every step only runs if all previous CUDA calls succeeded.
cudaError_t cuda_status = cudaMalloc((void**)&dev_wordList, padded_bytes);
if (cuda_status == cudaSuccess)
cuda_status = cudaMemset(dev_wordList, 0, padded_bytes);
if (cuda_status == cudaSuccess)
cuda_status = cudaMemcpy(dev_wordList, &(wordList[0][0]), word_bytes, cudaMemcpyHostToDevice);

// Round trip: copy the words back and print them to verify the transfer.
char (*resultWordList)[text_length] = new char[number_of_words][text_length]();
if (cuda_status == cudaSuccess)
cuda_status = cudaMemcpy(resultWordList, dev_wordList, word_bytes, cudaMemcpyDeviceToHost);
for(int i=0; i<number_of_words; i++)
cout<<resultWordList[i]<<endl;

//Setup execution parameters
dim3 grid(n_blocks, 1, 1);
dim3 threads(threads_per_block, 1, 1);

// An empty grid is an invalid launch configuration, so skip the launch when
// there are no words.
if (cuda_status == cudaSuccess && n_blocks > 0)
{
// cuPrintf keeps its output in a fixed-size device buffer that wraps around
// when full. The stock cuPrintf.cu reserves a 256-byte slot per call
// (CUPRINTF_MAX_LEN) and defaults to a 1 MB buffer, which is too small for
// ~10k records -- TODO confirm the slot size against the cuPrintf.cu used in
// this project.
const size_t printf_record_bytes = 256;
cuda_status = cudaPrintfInit(launched_threads * printf_record_bytes);
if (cuda_status == cudaSuccess)
{
testKernel<<<grid, threads>>>(dev_wordList);
// Launch-configuration errors are reported by cudaGetLastError();
// faults inside the kernel surface at the synchronisation.
cuda_status = cudaGetLastError();
if (cuda_status == cudaSuccess)
cuda_status = cudaDeviceSynchronize();
if (cuda_status == cudaSuccess)
cuda_status = cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();
}
}
if (cuda_status != cudaSuccess)
std::cerr<<"CUDA error: "<<cudaGetErrorString(cuda_status)<<std::endl;
如果我对这样的块/线程数使用较小的值:
// Smaller configuration: 20 blocks x 100 threads per block = 2000 threads.
dim3 grid(20, 1, 1);
dim3 threads(100, 1, 1);
内核启动是正确的,每个线程显示一个单词。但我需要10000个单词的这个程序。我错过了什么?