Cuda将char **传递给内核

时间:2016-01-10 01:26:57

标签: c++ cuda




// Host-side array of string pointers; the pointer array itself is allocated with new[].
char ** pwdAry;
pwdAry = new char *[numberOfPwd];

//pwdAry given some values (flat 2d array layout)
// NOTE(review): pwdAry is a char**, so sizeof yields the size of one pointer, not of the array.
const int pwdArySize = sizeof(pwdAry);    
// Two-dimensional grid and block: gridSize x gridSize blocks of blockSize x blockSize threads.
dim3 grid(gridSize,gridSize);
dim3 block(blockSize,blockSize);

// NOTE(review): pwdAry comes from new[] above and is handed to the kernel as-is;
// no device allocation or host-to-device copy appears in this snippet.
searchKeywordKernel << <grid, block >> >(pwdAry);



// Debug kernel: every thread prints its block-local thread index together with
// the block coordinates derived from its global position, then prints one
// entry of passwordList.
//
// passwordList is dereferenced on the device, so both the pointer array and
// every string it points to must be device-accessible and NUL-terminated.
// The array is indexed with the block-local thread index (the block index is
// not part of it), so every block reads the same entries and the array needs
// at least blockDim.x * blockDim.y elements.
__global__ void searchKeywordKernel(char **passwordList)
{
    // global 2D coordinates of this thread
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    // number of threads along x over the whole grid
    int pitch = blockDim.x * gridDim.x;
    // flattened global index, then split back into row / column
    int idx = x + y * pitch;
    int tidy = idx / pitch;
    int tidx = idx - (pitch * tidy);
    // block coordinates recovered from the global coordinates
    int bidx = tidx / blockDim.x;
    int bidy = tidy / blockDim.y;
    // thread index inside its own block
    int currentThread = threadIdx.x + blockDim.x * threadIdx.y;

    printf("hi, i am thread: %i, and my block x: %i, and y: %i\n", currentThread, bidx, bidy);
    printf("My password is: %s\n", passwordList[currentThread]);
}

1 个答案:

答案 0 :(得分:5)


  1. 使用“扁平”数组。对于询问如何处理双指针数组(char **或任何其他类型)或包含embedded pointers的任何数据结构的初学者,这是传统建议。基本思想是创建一个相同基础类型的单指针数组(例如char *),并将所有数据首尾相接地复制到该数组中。在本例中,由于各个数组元素(字符串)的长度不同,我们还需要额外传递一个数组,其中保存每个字符串在扁平数组中的起始索引。

  2. 使用直接双指针方法。我认为这种代码很难写,而且可能会影响性能。规范示例见here;对算法上所需步骤的逐步说明见here,和/或here——后者是一个带有方法说明的3D(即三重指针)工作示例(哎呀!)。这基本上是在CUDA中进行深层复制,我认为它比典型的CUDA编码要困难一些。

  3. 使用托管内存(managed memory)子系统,它在支持该功能的CUDA平台(CUDA platforms that support it)上可用。编码方面,这可能比上述两种方法中的任何一种都简单。

  4. 以下是所有3种方法的实例:

    $ cat t1035.cu
    #include <stdio.h>
    #include <string.h>
    #define nTPB 256
    // Prints one string per thread out of a flattened character buffer.
    //   data        - all strings stored back to back, each NUL-terminated
    //   indices     - indices[i] is the offset into data where string i starts
    //   num_strings - number of strings; threads whose index is >= num_strings do nothing
    // Uses only the x dimension of the launch for its index.
    __global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){
      int idx = threadIdx.x+blockDim.x*blockIdx.x;
      if (idx < num_strings)
        printf("Hello from thread %d, my string is %s\n", idx, data+indices[idx]);
    }
    // Prints one string per thread from a pointer-to-pointer array.
    //   data        - array of num_strings pointers; the array and every string it
    //                 points to are dereferenced on the device
    //   num_strings - number of strings; threads whose index is >= num_strings do nothing
    // Uses only the x dimension of the launch for its index.
    __global__ void kern_2D(char **data, unsigned num_strings){
      int idx = threadIdx.x+blockDim.x*blockIdx.x;
      if (idx < num_strings)
        printf("Hello from thread %d, my string is %s\n", idx, data[idx]);
    }
    // Builds a small host-side char** array of strings and hands it to a kernel
    // in three different ways:
    //   method 1 - flatten all strings into one char buffer plus a start-index array
    //   method 2 - build a device array of device pointers (deep copy)
    //   method 3 - allocate the pointer array and the strings as managed memory
    // CUDA API return codes are not checked here (kept short on purpose).
    int main(){
      const int num_strings = 3;
      const char s0[] = "s1\0";
      const char s1[] = "s2\0";
      const char s2[] = "s3\0";
      // ds[i] is the storage size of string i in bytes, terminators included
      int ds[num_strings];
      ds[0] = sizeof(s0)/sizeof(char);
      ds[1] = sizeof(s1)/sizeof(char);
      ds[2] = sizeof(s2)/sizeof(char);
      // pretend we have a dynamically allocated char** array
      char **data;
      data = (char **)malloc(num_strings*sizeof(char *));
      data[0] = (char *)malloc(ds[0]*sizeof(char));
      data[1] = (char *)malloc(ds[1]*sizeof(char));
      data[2] = (char *)malloc(ds[2]*sizeof(char));
      // initialize said array
      strcpy(data[0], s0);
      strcpy(data[1], s1);
      strcpy(data[2], s2);
      // method 1: "flattening"
      char *fdata = (char *)malloc((ds[0]+ds[1]+ds[2])*sizeof(char));
      unsigned *ind   = (unsigned *)malloc(num_strings*sizeof(unsigned));
      unsigned next = 0;
      for (int i = 0; i < num_strings; i++){
        strcpy(fdata+next, data[i]);
        ind[i] = next;      // where string i starts inside fdata
        next += ds[i];}
      //copy to device
      char *d_fdata;
      unsigned *d_ind;
      cudaMalloc(&d_fdata, next*sizeof(char));
      cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
      cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
      cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
      printf("method 1:\n");
      kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
      // wait for the kernel so its printf output appears before the next header
      cudaDeviceSynchronize();
      //method 2: "2D" (pointer-to-pointer) array
      char **d_data;
      cudaMalloc(&d_data, num_strings*sizeof(char *));
      // host-side staging array that holds the device pointers
      char **d_temp_data;
      d_temp_data = (char **)malloc(num_strings*sizeof(char *));
      for (int i = 0; i < num_strings; i++){
        cudaMalloc(&(d_temp_data[i]), ds[i]*sizeof(char));
        cudaMemcpy(d_temp_data[i], data[i], ds[i]*sizeof(char), cudaMemcpyHostToDevice);
        // store the device pointer for string i into slot i of the device pointer array
        cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);}
      printf("method 2:\n");
      kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
      // the managed allocations below are written from the host; no kernel may
      // still be running on devices without concurrent managed access
      cudaDeviceSynchronize();
      // method 3: managed allocations
      // start over with a managed char** array
      char **m_data;
      cudaMallocManaged(&m_data, num_strings*sizeof(char *));
      cudaMallocManaged(&(m_data[0]), ds[0]*sizeof(char));
      cudaMallocManaged(&(m_data[1]), ds[1]*sizeof(char));
      cudaMallocManaged(&(m_data[2]), ds[2]*sizeof(char));
      // initialize said array
      strcpy(m_data[0], s0);
      strcpy(m_data[1], s1);
      strcpy(m_data[2], s2);
      // call kernel directly on managed data
      printf("method 3:\n");
      kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
      // device printf output is not flushed automatically at program exit
      cudaDeviceSynchronize();
      return 0;
    }
    $ nvcc -arch=sm_35 -o t1035 t1035.cu
    $ cuda-memcheck ./t1035
    ========= CUDA-MEMCHECK
    method 1:
    Hello from thread 0, my string is s1
    Hello from thread 1, my string is s2
    Hello from thread 2, my string is s3
    method 2:
    Hello from thread 0, my string is s1
    Hello from thread 1, my string is s2
    Hello from thread 2, my string is s3
    method 3:
    Hello from thread 0, my string is s1
    Hello from thread 1, my string is s2
    Hello from thread 2, my string is s3
    ========= ERROR SUMMARY: 0 errors


    1. 如果您是第一次测试此代码,我建议使用cuda-memcheck运行它。为简洁起见,我省略了正确的CUDA错误检查(proper cuda error checking),但我建议您在CUDA代码遇到问题时随时使用它。此代码能否正确执行取决于是否有可用的托管内存子系统(请阅读我提供的文档链接)。如果您的平台不支持它,那么按原样运行此代码可能会导致段错误(seg fault),因为我没有包含正确的错误检查。

    2. 将双指针数组从设备复制回主机虽然在本示例中未明确涵盖,但实际上就是把3种方法各自的步骤反过来执行。对于方法1,单个cudaMemcpy调用即可完成。对于方法2,需要一个for循环来反转复制到设备的步骤(包括使用临时指针)。对于方法3,除了正确遵守托管内存编码实践(例如在内核调用之后、再次从主机代码访问数据之前调用cudaDeviceSynchronize())之外,根本不需要任何额外操作。

    3. 就“提供将char **数组传递给CUDA内核的方法”而言,我不想争论方法1和3是否严格符合问题的字面要求。如果您关注的范围就是这么窄,那么请使用方法2,否则请完全忽略这个答案。

    4. 编辑:根据以下评论中的问题,下面是对上述代码的修改版,它对主机端字符串使用了不同的初始化序列(位于第42行)。现在会出现编译警告,但这些警告来自OP特别要求使用的代码:

      $ cat t1036.cu
      #include <stdio.h>
      #include <string.h>
      #define nTPB 256
      // Prints one string per thread out of a flattened character buffer.
      //   data        - all strings stored back to back, each NUL-terminated
      //   indices     - indices[i] is the offset into data where string i starts
      //   num_strings - number of strings; threads whose index is >= num_strings do nothing
      // Uses only the x dimension of the launch for its index.
      __global__ void kern_1D(char *data, unsigned *indices, unsigned num_strings){
        int idx = threadIdx.x+blockDim.x*blockIdx.x;
        if (idx < num_strings)
          printf("Hello from thread %d, my string is %s\n", idx, data+indices[idx]);
      }
      // Prints one string per thread from a pointer-to-pointer array.
      //   data        - array of num_strings pointers; the array and every string it
      //                 points to are dereferenced on the device
      //   num_strings - number of strings; threads whose index is >= num_strings do nothing
      // Uses only the x dimension of the launch for its index.
      __global__ void kern_2D(char **data, unsigned num_strings){
        int idx = threadIdx.x+blockDim.x*blockIdx.x;
        if (idx < num_strings)
          printf("Hello from thread %d, my string is %s\n", idx, data[idx]);
      }
      // Same three-method demonstration as the first listing, but the host strings
      // come from the pwdAry initialization sequence requested in the comments
      // instead of the data/s0..s2 setup (which is compiled out with #if 0).
      // Every string gets a fixed 1024-byte slot.
      // CUDA API return codes are not checked here (kept short on purpose).
      int main(){
        const int num_strings = 3;
      #if 0
        const char s0[] = "s1\0";
        const char s1[] = "s2\0";
        const char s2[] = "s3\0";
        int ds[num_strings];
        ds[0] = sizeof(s0)/sizeof(char);
        ds[1] = sizeof(s1)/sizeof(char);
        ds[2] = sizeof(s2)/sizeof(char);
        // pretend we have a dynamically allocated char** array
        char **data;
        data = (char **)malloc(num_strings*sizeof(char *));
        data[0] = (char *)malloc(ds[0]*sizeof(char));
        data[1] = (char *)malloc(ds[1]*sizeof(char));
        data[2] = (char *)malloc(ds[2]*sizeof(char));
        // initialize said array
        strcpy(data[0], s0);
        strcpy(data[1], s1);
        strcpy(data[2], s2);
      #endif
        // Initialization kept exactly as requested. Note that the second loop rebinds
        // every pwdAry[a] to a string literal: the 1024-byte buffers from the first
        // loop are no longer referenced, each entry now points at only 7 bytes, and
        // the literal-to-"char *" conversion is what produces the compile warnings.
        char ** pwdAry; pwdAry = new char *[num_strings]; for (int a = 0; a < num_strings; a++) { pwdAry[a] = new char[1024]; } for (int a = 0; a < 3; a++) { pwdAry[a] = "hello\0"; }
        // method 1: "flattening"
        char *fdata = (char *)malloc((1024*num_strings)*sizeof(char));
        unsigned *ind   = (unsigned *)malloc(num_strings*sizeof(unsigned));
        unsigned next = 0;
        for (int i = 0; i < num_strings; i++){
          // strncpy stops reading at the terminator and zero-fills the rest of the
          // 1024-byte slot, so it never reads past the end of the source string
          strncpy(fdata+next, pwdAry[i], 1024);
          ind[i] = next;      // where string i starts inside fdata
          next += 1024;}
        //copy to device
        char *d_fdata;
        unsigned *d_ind;
        cudaMalloc(&d_fdata, next*sizeof(char));
        cudaMalloc(&d_ind, num_strings*sizeof(unsigned));
        cudaMemcpy(d_fdata, fdata, next*sizeof(char), cudaMemcpyHostToDevice);
        cudaMemcpy(d_ind, ind, num_strings*sizeof(unsigned), cudaMemcpyHostToDevice);
        printf("method 1:\n");
        kern_1D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_fdata, d_ind, num_strings);
        // wait for the kernel so its printf output appears before the next header
        cudaDeviceSynchronize();
        //method 2: "2D" (pointer-to-pointer) array
        char **d_data;
        cudaMalloc(&d_data, num_strings*sizeof(char *));
        // host-side staging array that holds the device pointers
        char **d_temp_data;
        d_temp_data = (char **)malloc(num_strings*sizeof(char *));
        for (int i = 0; i < num_strings; i++){
          cudaMalloc(&(d_temp_data[i]), 1024*sizeof(char));
          // copy only the string and its terminator; the source is shorter than 1024 bytes
          cudaMemcpy(d_temp_data[i], pwdAry[i], (strlen(pwdAry[i])+1)*sizeof(char), cudaMemcpyHostToDevice);
          // store the device pointer for string i into slot i of the device pointer array
          cudaMemcpy(d_data+i, &(d_temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);}
        printf("method 2:\n");
        kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(d_data, num_strings);
        // the managed allocations below are written from the host; no kernel may
        // still be running on devices without concurrent managed access
        cudaDeviceSynchronize();
        // method 3: managed allocations
        // start over with a managed char** array
        char **m_data;
        cudaMallocManaged(&m_data, num_strings*sizeof(char *));
        cudaMallocManaged(&(m_data[0]), 1024*sizeof(char));
        cudaMallocManaged(&(m_data[1]), 1024*sizeof(char));
        cudaMallocManaged(&(m_data[2]), 1024*sizeof(char));
        // initialize said array
        for (int i = 0; i < num_strings; i++)
          strncpy(m_data[i], pwdAry[i], 1024);
        // call kernel directly on managed data
        printf("method 3:\n");
        kern_2D<<<(num_strings+nTPB-1)/nTPB, nTPB>>>(m_data, num_strings);
        // device printf output is not flushed automatically at program exit
        cudaDeviceSynchronize();
        return 0;
      }
      $ nvcc -arch=sm_35 -o t1036 t1036.cu
      warning: conversion from a string literal to "char *" is deprecated
      warning: conversion from a string literal to "char *" is deprecated
      $ cuda-memcheck ./t1036
      ========= CUDA-MEMCHECK
      method 1:
      Hello from thread 0, my string is hello
      Hello from thread 1, my string is hello
      Hello from thread 2, my string is hello
      method 2:
      Hello from thread 0, my string is hello
      Hello from thread 1, my string is hello
      Hello from thread 2, my string is hello
      method 3:
      Hello from thread 0, my string is hello
      Hello from thread 1, my string is hello
      Hello from thread 2, my string is hello
      ========= ERROR SUMMARY: 0 errors