Cuda奇怪的Printf

时间:2014-08-14 19:33:43

标签: cuda

更新后,我写了一个测试程序来测试以下想法:

  1. cuda流将数据复制到gpu
  2. 第二个cuda流读取和处理数据。
  3. 仅当第二个流处理先前的数据时,才会通过第一个流复制另外一个数据。
  4. 但是,它不起作用:只复制第一个数据然后在那里等待。

    > #include "cuda.h"
    #include <iostream>
    #include <pthread.h>
    
    const int UNPROCESSED = 1;
    const int PROCESSED = 2;
    const int DONE = 3;
    const int RUNNING= 0;
    const int NUM_OF_DATA = 100;
    const int NUM_OF_BLOCKS = 1;
    const int THREADS_PER_BLOCK = 1;
    
    //int data_states[NUM_OF_DATA];
    cudaStream_t cuda_stream[2];
    volatile int* process_state;
    volatile int* d_process_state;
    volatile int* d_copier_state;
    int* d_data_state;
    int* h_data_states;
    cudaError_t cuda_status;
    
    using namespace std;
    
    void* copy_data(void* arg){
       int i=0;
       //cout << "in copy_data" << endl;
       while(i < NUM_OF_DATA){
          if (*process_state != UNPROCESSED){
             cout << "Now copy data " << i << " with state = " << h_data_states[i] <<  endl;
             *process_state = UNPROCESSED;
             cuda_status = cudaMemcpyAsync(d_data_state, &h_data_states[i], sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
             if (cuda_status != cudaSuccess){
                cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
             }
             i++;
          }
       }
       int copier_state = DONE;
       cudaMemcpyAsync((void*) d_copier_state, &copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
    }
    
    __global__ void process_data(volatile int* data_state, volatile int* process_state, volatile int* copier_state){
       int i = 0;
       printf(" i = %d\n", i);
       while(*copier_state != DONE){
          printf(" i = %d, copier_state = %d, data_state = %d\n", i, *copier_state, *data_state);
          if(*data_state == UNPROCESSED){
            printf("now processing data %d\n", i);
            i++;
            // process data here, skipped
            *process_state = PROCESSED;
            *data_state = PROCESSED;
            //__threadfence_system();
          }
       }
       printf("process_data is done\n");
    }
    
    int main(int argc, char **argv){
      int i;
    
      cudaSetDeviceFlags(cudaDeviceMapHost);
    
      cuda_status = cudaMallocHost((void**) &process_state, NUM_OF_BLOCKS*sizeof(int),  cudaHostAllocMapped);
      if (cuda_status != cudaSuccess){
          cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
      }
      cudaHostGetDevicePointer((int**) &d_process_state, (int*) process_state, 0);
    
      cuda_status = cudaMalloc((void**) &d_copier_state, NUM_OF_BLOCKS*sizeof(int));
      if (cuda_status != cudaSuccess){
          cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
      }
      cudaMemset((void*)d_copier_state, RUNNING, sizeof(int));
    
      cuda_status = cudaMallocHost((void**) &h_data_states, NUM_OF_DATA*sizeof(int), 0);
      if (cuda_status != cudaSuccess){
          cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
      }
      for(i = 0; i < NUM_OF_DATA; i++){
         h_data_states[i] = UNPROCESSED;
      }
    
      cudaStreamCreate(&cuda_stream[0]);
      cudaStreamCreate(&cuda_stream[1]);
     pthread_t thread;
     int thread_state = pthread_create(&thread, NULL, &copy_data, h_data_states);
      if(thread_state){
         cout << "Error: unable to create thread (produce_instances), "<< thread_state << endl;
         exit(-1);
      }
    
    
      //cout << "Starting kernel" << endl;
      process_data<<<NUM_OF_BLOCKS, THREADS_PER_BLOCK, 0, cuda_stream[1]>>>(d_data_state, d_process_state, d_copier_state);
    
    
      cudaDeviceSynchronize();
      cudaFree(d_data_state);
      cudaFree((void*) d_copier_state);
      cudaFreeHost((void*) process_state);
    
      return 0;
    }
    

    我的程序在映射内存中有一个变量(state)(cudaMallocHost带有cudaHostAllocMapped标志)。在CPU上,变量由指针(state_pointer)访问,而在gpu上,相应的指针是d_state_pointer

    CPU通过state_pointer将变量设置为UNPROCESSED,然后gpu检查d_state_variable:如果它是UNPROCESSED,则处理一些内容然后将其更改为PROCESSED。

    我的问题是

    1. 似乎gpu没有读取d_state_pointer的正确值,所以总是等待d_state_pointer更改为UNPROCESSED。
    2. 但是,如果在if (threadIdx.x==0) printf("weird\n");中添加了while loop语句,则会读取d_state_pointer的正确值(应该是UNPROCESSED),然后继续。
    3. Cuda文档提到printf可能会改变线程执行的顺序。但是我不明白为什么没有d_state_pointer语句就无法读取if-printf的正确值?而且,如果没有if(threadIdx.x==0)printf语句本身就无济于事。另一方面,如果没有printfif(threadIdx.x==0)也无济于事。

      有什么建议吗?

1 个答案:

答案 0 :(得分:3)

可能d_state_pointer不是volatile。这意味着GPU可以自由地将值缓存在L2中,并且不知道主机是否/何时更新它。

改为使用volatile变量/指针。每次访问时都是makes the GPU code retrieve the value from the source,而不是缓存。此外,要确保更新朝另一个方向发展,请使用__threadfence()

如果您需要帮助,请查看此answer,或发布一个简短,完整,可编译的代码来说明问题(SO期望this)。