I am trying to do the following: assume I have a system with heterogeneous processing units (PUs), including CPUs, GPUs, and Intel Xeon Phis. The GPU devices may also have different characteristics, so splitting the workload across these devices is not as simple as N / num_devices.
omp_set_num_threads(system->getPUCount());
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
Each thread loops until it reaches the end of the data.
PU pu = listOfPUs[cpu_thread_id];
//threads are active until all data is processed
while (finish_0 < N) {
//the my_start and my_finish are private to a specific device.
int my_start = 0;
int my_finish = 0;
I set a constant chunk_size for each PU, and I create as many CPU threads as there are PUs, which means each CPU thread controls one PU. Each thread determines the start and end of its own chunk of data (a critical section of the code):
#pragma omp critical (chunkdetermination_0)
{
start_0 = finish_0;
finish_0 = start_0 + pu.getChunkSize();
if(finish_0 > N)
finish_0 = N;
my_start = start_0;
my_finish = finish_0;
}
Now I check the type of the PU and execute the corresponding kernel.
if(pu.getType() == GPU) {
int myN = my_finish-my_start;
CudaSafeCall(cudaSetDevice(pu.getId()));
unsigned int nbytes_per_kernel = sizeof(double)*myN;
//memory allocation
CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
//data transfer
CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
CudaSafeCall(cudaMemcpy(d_c, c+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
//block and grid values
dim3 gpu_threads(128);
dim3 gpu_blocks(myN/gpu_threads.x);
if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
//execute kernel
kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
//data transfer device to host
CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
//synchronize device
CudaSafeCall(cudaDeviceSynchronize());
//free device memory
CudaSafeCall(cudaFree(d_a));
CudaSafeCall(cudaFree(d_c));
}
When I test this code with one GPU it works fine, but when I test it with two GPUs it does not. I have also tried using CUDA streams, but unfortunately could not get that to work either.
Any suggestions on what I am doing wrong, or how I can fix this problem?
Here is the complete example:
#include <omp.h>
#include <stdio.h>
#include <vector>
#include <iostream>
#include <sys/time.h>
#include <float.h>
#include <limits.h>
using namespace std;
#define CPU 0
#define GPU 1
#define MIC 2
class PU
{
public:
PU(int puId, int puType)
{
id = puId;
type = puType;
}
int getId() {
return id;
}
void setId(int puId) {
id = puId;
}
int getType() {
return type;
}
char * getTypeAsString() {
if(type == CPU)
return (char *) "CPU";
else if (type == GPU)
return (char *) "GPU";
else
return (char *) "MIC";
}
void setType(int puType) {
type = puType;
}
int getChunkSize() {
return chunkSize;
}
void setChunkSize(int puChunkSize) {
chunkSize = puChunkSize;
}
private:
int id;
int type;
int chunkSize;
};
class System
{
public:
System() {
numOfPUs = 0;
//Adding PU0 of type GPU to the system
PU * pu0 = new PU(0, GPU);
pu0->setChunkSize(262144);
listOfPUs.push_back(*pu0);
numOfPUs ++;
//Adding PU1 of type GPU to the system
PU * pu1 = new PU(1, GPU);
pu1->setChunkSize(262144);
listOfPUs.push_back(*pu1);
numOfPUs ++;
}
vector<PU> getPUs() {
return listOfPUs;
}
int getPUCount() {
return numOfPUs;
}
private:
vector<PU> listOfPUs;
int numOfPUs;
};
#define N 2097152
//********************** CUDA Error checker **********************
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
// More careful checking. However, this will affect performance.
// Comment away if needed.
err = cudaDeviceSynchronize();
if( cudaSuccess != err )
{
fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
//********************** CUDA Error checker **********************
__global__ void kernel_0(double * a, double * c, int len)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < len)
{
c[idx] = a[idx];
}
}
/*
* Start of Generated Code
* This code enables execution on both host CPUs and accelerating devices
*/
void hybrid_function_0 (double *a, double *c)
{
System * system = new System();
//device variable declarations
double *d_a;
double *d_c;
//start and finish determine the chunk size of a device
int start_0 = 0;
int finish_0 = 0;
vector<PU> listOfPUs = system->getPUs();
printf("Num devices = %d\n", system->getPUCount());
omp_set_num_threads(system->getPUCount());
//one thread of the CPU controls one GPU device. The rest of CPU threads will be used to process data.
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
PU pu = listOfPUs[cpu_thread_id];
//threads are active until all data is processed
while (finish_0 < N) {
//the my_start and my_finish are private to a specific device.
int my_start = 0;
int my_finish = 0;
//the determination of chunks should be performed sequentially, in order to avoid two or more devices processing the same data.
#pragma omp critical (chunkdetermination_0)
{
start_0 = finish_0;
finish_0 = start_0 + pu.getChunkSize();
if(finish_0 > N)
finish_0 = N;
my_start = start_0;
my_finish = finish_0;
}
//devices with id less than nDevices are GPU devices. The host CPU has id = nDevices
if(pu.getType() == GPU) {
int myN = my_finish-my_start;
printf("device_id\t%d\tpu_id\t%d\ttype\t%s\tprocessing\t%d-%d (%lu KB)\n", cpu_thread_id, pu.getId(), pu.getTypeAsString(), my_start, my_finish, sizeof(double)*myN/1000);
CudaSafeCall(cudaSetDevice(pu.getId()));
unsigned int nbytes_per_kernel = sizeof(double)*myN;
//memory allocation
CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
//data transfer
CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
CudaSafeCall(cudaMemcpy(d_c, c+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
//block and grid values
dim3 gpu_threads(128);
dim3 gpu_blocks(myN/gpu_threads.x);
if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
//execute kernel
kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
//data transfer device to host
CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
//synchronize device
CudaSafeCall(cudaDeviceSynchronize());
//free device memory
CudaSafeCall(cudaFree(d_a));
CudaSafeCall(cudaFree(d_c));
}
//execute on host
else if (pu.getType() == CPU) {
omp_set_num_threads(omp_get_max_threads());
#pragma omp parallel for
for (int i = my_start; i < my_finish; i++)
{
c[i] = a[i];
}
}
//execute on MIC
else if (pu.getType() == MIC) {
#pragma offload target(mic: cpu_thread_id) in(a[my_start:my_finish]) in(c[my_start:my_finish]) out(c[my_start:my_finish])
{
#pragma omp parallel for
for (int i = my_start; i < my_finish; i++)
{
c[i] = a[i];
}
}
}
}
}
}
/*
* End of Generated Code
*/
int main()
{
double *a, *b, *c;
double scalar;
/* Allocate memory on host */
a = (double*)malloc(sizeof(double)*N);
b = (double*)malloc(sizeof(double)*N);
c = (double*)malloc(sizeof(double)*N);
// omp_set_num_threads(omp_get_max_threads());
printf("OMP Max threads %d\n", omp_get_max_threads());
#pragma omp parallel
{
#pragma omp master
printf("OMP Num threads %d\n", omp_get_num_threads());
}
// #pragma omp parallel for
// for(int i = 0; i < 10; i++) {
// printf("I am thread %d\n", omp_get_thread_num());
// }
//initialization of variables
#pragma omp parallel for
for (int j=0; j<N; j++) {
a[j] = 1.0;
b[j] = 2.0;
c[j] = 0.0;
}
#pragma omp parallel for
for (int j = 0; j < N; j++)
a[j] = 2.0E0 * a[j];
scalar=3.0f;
printf("%s\n", "COPY Started");
hybrid_function_0(a, c);
printf("%s\n", "COPY Finished");
return 0;
}
You can compile it with:
nvcc mini.cu -o mini -Xcompiler "-fopenmp"
When two or more GPUs are assigned to run this code, I get different error messages, and sometimes it just hangs and does nothing. When I check the GPU state via nvidia-smi, it shows the GPUs are executing the code, but at 0% utilization.
Some of the errors are:
cudaSafeCall() failed at mini.cu:221 : invalid argument
cudaSafeCall() failed at mini.cu:221 : driver shutting down
Answer (score: 2)
I think you should move the device variable declarations inside hybrid_function_0 into the main OpenMP parallel section, like this:
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
//device variable declarations
double *d_a;
double *d_c;
...
As it stands, there is a memory race between threads, which can result in the device pointer values being overwritten by values allocated on the wrong device by other threads. That would produce the various invalid argument errors you report in the API calls that take those device pointers as arguments.
I would expect the error could occur in any of the cudaMemset, cudaMemcpy, or cudaFree calls in that function, or in the kernel launch, and it could show up in different places depending on the behaviour of the racing threads.
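For reference, here is a minimal, trimmed sketch of what the GPU branch could look like once the device pointers are thread-private. It assumes the rest of hybrid_function_0 (a, c, listOfPUs, start_0, finish_0, kernel_0, CudaSafeCall) is unchanged from the question; the empty-chunk guard is an addition for illustration, not part of the original code:
#pragma omp parallel
{
    unsigned int cpu_thread_id = omp_get_thread_num();
    PU pu = listOfPUs[cpu_thread_id];
    //device pointers are now private to the controlling thread,
    //so one thread cannot overwrite another thread's allocations
    double *d_a = NULL;
    double *d_c = NULL;
    while (finish_0 < N) {
        int my_start = 0;
        int my_finish = 0;
        #pragma omp critical (chunkdetermination_0)
        {
            start_0 = finish_0;
            finish_0 = start_0 + pu.getChunkSize();
            if (finish_0 > N) finish_0 = N;
            my_start = start_0;
            my_finish = finish_0;
        }
        if (pu.getType() == GPU) {
            int myN = my_finish - my_start;
            //skip empty chunks that can occur when another thread consumed the last piece
            if (myN <= 0) continue;
            CudaSafeCall(cudaSetDevice(pu.getId()));
            size_t nbytes = sizeof(double) * myN;
            CudaSafeCall(cudaMalloc((void**)&d_a, nbytes));
            CudaSafeCall(cudaMalloc((void**)&d_c, nbytes));
            CudaSafeCall(cudaMemcpy(d_a, a + my_start, nbytes, cudaMemcpyHostToDevice));
            dim3 gpu_threads(128);
            dim3 gpu_blocks((myN + gpu_threads.x - 1) / gpu_threads.x);
            kernel_0<<<gpu_blocks, gpu_threads>>>(d_a, d_c, myN);
            CudaSafeCall(cudaMemcpy(c + my_start, d_c, nbytes, cudaMemcpyDeviceToHost));
            CudaSafeCall(cudaDeviceSynchronize());
            CudaSafeCall(cudaFree(d_a));
            CudaSafeCall(cudaFree(d_c));
        }
    }
}
With the pointers private, each cudaMalloc/cudaFree pair stays bound to the device selected by that thread's cudaSetDevice call, so the two GPUs no longer stomp on each other's allocations.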