I wrote a test program to try out the following idea: (1) one CUDA stream copies data to the GPU; the copies are issued from a pthread. (2) A second CUDA stream reads and processes the data. (3) The next piece of data is copied via the first stream only after the second stream has finished processing the previous one.
However, it does not work: only the first piece of data is copied, and then everything just waits there.
#include "cuda.h"
#include <iostream>
#include <pthread.h>
const int UNPROCESSED = 1;
const int PROCESSED = 2;
const int DONE = 3;
const int RUNNING= 0;
const int NUM_OF_DATA = 100;
const int NUM_OF_BLOCKS = 1;
const int THREADS_PER_BLOCK = 1;
//int data_states[NUM_OF_DATA];
cudaStream_t cuda_stream[2];
volatile int* process_state;
volatile int* d_process_state;
volatile int* d_copier_state;
int* d_data_state;
int* h_data_states;
cudaError_t cuda_status;
using namespace std;
void* copy_data(void* arg){
int i=0;
//cout << "in copy_data" << endl;
while(i < NUM_OF_DATA){
if (*process_state != UNPROCESSED){
cout << "Now copy data " << i << " with state = " << h_data_states[i] << endl;
*process_state = UNPROCESSED;
cuda_status = cudaMemcpyAsync(d_data_state, &h_data_states[i], sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
if (cuda_status != cudaSuccess){
cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
}
i++;
}
}
int copier_state = DONE;
cudaMemcpyAsync((void*) d_copier_state, &copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
}
__global__ void process_data(volatile int* data_state, volatile int* process_state, volatile int* copier_state){
int i = 0;
printf(" i = %d\n", i);
while(*copier_state != DONE){
printf(" i = %d, copier_state = %d, data_state = %d\n", i, *copier_state, *data_state);
if(*data_state == UNPROCESSED){
printf("now processing data %d\n", i);
i++;
// process data here, skipped
*process_state = PROCESSED;
*data_state = PROCESSED;
//__threadfence_system();
}
}
printf("process_data is done\n");
}
int main(int argc, char **argv){
int i;
cudaSetDeviceFlags(cudaDeviceMapHost);
cuda_status = cudaMallocHost((void**) &process_state, NUM_OF_BLOCKS*sizeof(int), cudaHostAllocMapped);
if (cuda_status != cudaSuccess){
cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
}
cudaHostGetDevicePointer((int**) &d_process_state, (int*) process_state, 0);
cuda_status = cudaMalloc((void**) &d_copier_state, NUM_OF_BLOCKS*sizeof(int));
if (cuda_status != cudaSuccess){
cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
}
cudaMemset((void*)d_copier_state, RUNNING, sizeof(int));
cuda_status = cudaMallocHost((void**) &h_data_states, NUM_OF_DATA*sizeof(int), 0);
if (cuda_status != cudaSuccess){
cout << "Error when allocating pinned host memory (full_instance_states)" << endl;
}
for(i = 0; i < NUM_OF_DATA; i++){
h_data_states[i] = UNPROCESSED;
}
cudaStreamCreate(&cuda_stream[0]);
cudaStreamCreate(&cuda_stream[1]);
pthread_t thread;
int thread_state = pthread_create(&thread, NULL, &copy_data, h_data_states);
if(thread_state){
cout << "Error: unable to create thread (produce_instances), "<< thread_state << endl;
exit(-1);
}
//cout << "Starting kernel" << endl;
process_data<<<NUM_OF_BLOCKS, THREADS_PER_BLOCK, 0, cuda_stream[1]>>>(d_data_state, d_process_state, d_copier_state);
cudaDeviceSynchronize();
cudaFree(d_data_state);
cudaFree((void*) d_copier_state);
cudaFreeHost((void*) process_state);
return 0;
}
Answer (score: 1)
You never allocate d_data_state in any way. It is a NULL pointer throughout the entire program, so this usage of it is invalid:
cuda_status = cudaMemcpyAsync(d_data_state, &h_data_states[i], sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
When I run your program, I get an error printout from the very next line of code. Since your kernel also uses d_data_state (which is an invalid pointer), running your code under cuda-memcheck reports all sorts of invalid global read errors. Because you never allocate anything for d_data_state, your code cannot work correctly.
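A minimal sketch of the missing allocation (using a single-int device slot, as in the corrected code further down) would be to add something like this before the copier thread and the kernel are started:

// allocate one int on the device to hold the current data item's state
cuda_status = cudaMalloc((void**) &d_data_state, sizeof(int));
if (cuda_status != cudaSuccess){
    cout << "Error when allocating device memory (d_data_state)" << endl;
}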
There are several other problems in your code as well. To pick just one example:
int copier_state = DONE;
cudaMemcpyAsync((void*) d_copier_state, &copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
For cudaMemcpyAsync to work as expected (i.e. asynchronously, overlapping with other stream activity), the host memory must come from a pinned allocation. int copier_state = DONE; does not create a pinned allocation, so copying from it breaks the asynchronous overlap of the cudaMemcpyAsync operation.
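One way to restore truly asynchronous behavior (the approach taken in the corrected code below) is to stage the flag in pinned host memory and copy from that instead of from a stack variable; roughly:

// in main(): pinned staging buffer for the copier-state flag
int* h_copier_state;
cudaMallocHost((void**) &h_copier_state, sizeof(int), 0);
*h_copier_state = RUNNING;

// in copy_data(): signal completion through the pinned buffer
*h_copier_state = DONE;
cudaMemcpyAsync((void*) d_copier_state, h_copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);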
Here is a version of the code that works for me (now with some extra protection against a race condition):
#include <iostream>
#include <pthread.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
const int UNPROCESSED = 1;
const int PROCESSED = 2;
const int DONE = 3;
const int RUNNING= 0;
const int NUM_OF_DATA = 100;
const int NUM_OF_BLOCKS = 1;
const int THREADS_PER_BLOCK = 1;
//int data_states[NUM_OF_DATA];
cudaStream_t cuda_stream[2];
volatile int* process_state;
volatile int* d_process_state;
volatile int* d_copier_state;
int* d_data_state;
int* h_data_states;
int* h_copier_state;
cudaError_t cuda_status;
using namespace std;
void* copy_data(void* arg){
int i=0;
cudaSetDevice(0);
//cout << "in copy_data" << endl;
while(i < NUM_OF_DATA){
if (*process_state != UNPROCESSED){
// cout << "Now copy data " << i << " with state = " << h_data_states[i] << endl;
*process_state = UNPROCESSED;
cudaMemcpyAsync(d_data_state, &(h_data_states[i]), sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
cudaStreamSynchronize(cuda_stream[0]);
cudaCheckErrors("thread cudaMemcpyAsync fail");
//*process_state = UNPROCESSED;
i++;
}
}
*h_copier_state = DONE;
cudaMemcpyAsync((void *)d_copier_state, h_copier_state, sizeof(int), cudaMemcpyHostToDevice, cuda_stream[0]);
cudaCheckErrors("thread cudaMemcpyAsync 2 fail");
// cout << "Thread finished" << endl;
return NULL;
}
__global__ void process_data(volatile int* data_state, volatile int* process_state, volatile int* copier_state){
int i = 0;
//printf(" i = %d\n", i);
while(*copier_state != DONE){
//printf(" i = %d, copier_state = %d, data_state = %d\n", i, *copier_state, *data_state);
if(*data_state == UNPROCESSED){
//printf("now processing data %d\n", i);
i++;
// process data here, skipped
*data_state = PROCESSED;
__threadfence_system();
*process_state = PROCESSED;
__threadfence_system();
}
}
// printf("process_data is done\n");
}
int main(int argc, char **argv){
int i;
cudaSetDevice(0);
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaMallocHost((void**) &process_state, NUM_OF_BLOCKS*sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaMallocHost 1 fail");
cudaHostGetDevicePointer((int**) &d_process_state, (int*) process_state, 0);
cudaMalloc((void**) &d_copier_state, sizeof(int));
cudaCheckErrors("cudaMalloc 1 fail");
cudaMemset((void*)d_copier_state, RUNNING, sizeof(int));
cudaMallocHost((void**) &h_copier_state, sizeof(int), 0);
cudaCheckErrors("cudaMallocHost 3 fail");
*h_copier_state = RUNNING;
cudaMallocHost((void**) &h_data_states, NUM_OF_DATA*sizeof(int), 0);
cudaCheckErrors("cudaMallocHost 2 fail");
for(i = 0; i < NUM_OF_DATA; i++){
h_data_states[i] = UNPROCESSED;
}
cudaMalloc((void**) &d_data_state, sizeof(int));
cudaCheckErrors("cudaMalloc 2 fail");
cudaMemcpy((void*)d_data_state, &(h_data_states[0]), sizeof(int), cudaMemcpyHostToDevice);
cudaStreamCreate(&cuda_stream[0]);
cudaStreamCreate(&cuda_stream[1]);
pthread_t thread;
int thread_state = pthread_create(&thread, NULL, &copy_data, NULL);
if(thread_state){
cout << "Error: unable to create thread (produce_instances), "<< thread_state << endl;
exit(-1);
}
//cout << "Starting kernel" << endl;
process_data<<<NUM_OF_BLOCKS, THREADS_PER_BLOCK, 0, cuda_stream[1]>>>(d_data_state, d_process_state, d_copier_state);
cudaDeviceSynchronize();
return 0;
}
By the way, there is no need for all the complexity of pthreads just to run one extra thread. All of the pthread code could be placed in the main host thread after the CUDA kernel launch, and your program would still work correctly: after the kernel launch, the host thread runs asynchronously with, and in parallel to, the device kernel.
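As a rough sketch of that single-threaded variant (assuming the same allocations and the copy_data function from the working version above), main() would simply run the copy loop after the launch:

// launch the consumer kernel first; the launch returns immediately
process_data<<<NUM_OF_BLOCKS, THREADS_PER_BLOCK, 0, cuda_stream[1]>>>(d_data_state, d_process_state, d_copier_state);
// then run the producer loop directly in the main host thread,
// which executes concurrently with the still-running kernel
copy_data(NULL);
cudaDeviceSynchronize();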