我用CUDA实现的一维波动方程求解器,运行起来比C/C++版本还要慢。谁能告诉我哪里做错了?这是我的代码:
/*
 * Solver1d: advances the 1D wave equation by one explicit finite-difference
 * time step.
 *
 *   up - output, solution at the new time level      (indexed 0 .. n-1)
 *   u  - input,  solution at the current time level  (indexed 0 .. n-1)
 *   um - input,  solution at the previous time level (indexed 0 .. n-1)
 *
 * Launch layout: 1D grid of 1D blocks providing at least n threads in total;
 * one thread updates one grid point and surplus threads exit immediately.
 * L, n and c are defined elsewhere in the file (assumed to be compile-time
 * constants, in which case the coefficient below is folded by the compiler).
 */
__global__ void Solver1d(float* up, float* u, float* um)
{
    // Squared Courant number. It has to be derived from the same time step the
    // host uses to advance t and to build um (dt = 0.05*dx/c); using dt = dx/c
    // here would make the kernel integrate with R = 1 while the host assumes
    // R = 0.0025.
    const float dx = (float)L / n;
    const float dt = 0.05f * dx / c;
    const float r  = c * (dt / dx);
    const float R  = r * r;

    // index mapping between data and threads
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    // Only threads that map onto a valid grid point do any work.
    if (id >= n)
    {
        return;
    }

    // Both end points are held at zero.
    if (id == 0 || id == n - 1)
    {
        up[id] = 0.0f;
        return;
    }

    // Interior points: central differences in time and space.
    up[id] = 2.0f * u[id] - um[id] + R * (u[id + 1] - 2.0f * u[id] + u[id - 1]);
}
// main program
/*
 * Host driver for the 1D wave solver: builds the initial condition, time-steps
 * on the GPU until t exceeds T, then compares the final field with the exact
 * solution. The three time levels stay on the device for the whole loop; only
 * the buffer pointers are rotated between steps.
 */
int main(int argc, char *argv[])
{
    // declare all variables
    int i;
    // inner is used as an accumulator below, so it has to start from zero.
    float inner = 0.0f, L2_exact, ue[n], dx, dt;
    dx = (float)L/n;
    dt = (float)(0.05*dx/c); // Max time step
    float r = c*((float)dt/dx);
    float R = r*r;

    // Host copies of the three time levels
    float up[n], um[n], u[n];

    // Pointers for device memory allocation
    float *dev_up, *dev_u, *dev_um;

    // Allocating memory to device (GPU)
    HANDLE_ERROR(cudaMalloc((void**)&dev_up, n*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_u, n*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_um, n*sizeof(float)));

    cudaEvent_t start, stop;
    float elapsedTime;

    // Start timer
    HANDLE_ERROR(cudaEventCreate( &start ));
    HANDLE_ERROR(cudaEventCreate( &stop ));
    HANDLE_ERROR(cudaEventRecord( start,0 ));

    // Initial condition
    for(i=0;i<n;i++)
    {
        u[i]=sin(2*PI*i*dx);
    }

    // Enforcing special formula for t = -1 (boundary values set once, outside the loop)
    um[0] = 0;
    um[n-1] = 0;
    for(i=1;i<n-1;i++)
    {
        um[i] = u[i] + 0.5*R*(u[i-1] - 2*u[i] + u[i+1]); //+ 0.5*dt*dt*f(i*dx,t)
    }

    // setting blocks and threads numbers: ceil(n / noThreads) blocks
    int noThreads=128;
    dim3 dimBlock(noThreads,1,1);
    dim3 dimGrid((n + noThreads - 1)/noThreads,1,1);

    // move u and um to GPU
    HANDLE_ERROR(cudaMemcpy(dev_u, u, n*sizeof(float), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_um, um, n*sizeof(float), cudaMemcpyHostToDevice));

    float t=0;
    while(t<=T)
    {
        t += dt;
        Solver1d<<<dimGrid,dimBlock>>>(dev_up,dev_u,dev_um);
        // A launch reports configuration errors only through the last-error state.
        HANDLE_ERROR(cudaGetLastError());

        // Rotate the time levels on the device: previous <- current <- new.
        // The oldest buffer is recycled as the next output, so nothing is
        // copied and the host is not involved in the stepping loop.
        float *recycled = dev_um;
        dev_um = dev_u;
        dev_u  = dev_up;
        dev_up = recycled;
    }

    HANDLE_ERROR(cudaEventRecord( stop,0 ));
    HANDLE_ERROR(cudaEventSynchronize( stop ));
    HANDLE_ERROR(cudaEventElapsedTime( &elapsedTime,start,stop ));
    HANDLE_ERROR(cudaEventDestroy( start ));
    HANDLE_ERROR(cudaEventDestroy( stop ));
    printf("elapsed time: %lf sec\n",elapsedTime/1000);

    // Move the solution from GPU to CPU. After the last rotation the newest
    // level lives in dev_u, the one before it in dev_um, and dev_up holds the
    // level before that.
    HANDLE_ERROR(cudaMemcpy(up, dev_u, n*sizeof(float), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(u, dev_um, n*sizeof(float), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(um, dev_up, n*sizeof(float), cudaMemcpyDeviceToHost));

    int j;
    float L2cpuSolution=0.0;
    float L2gpuSolution=0.0;
    float ERROR_PERCENTAGE=0.0;

    // Verification with exact solution
    for(j=0;j<(n);j++)
    {
        ue[j]=0.5*(sin(2*PI*(j*dx+c*T))+sin(2*PI*(j*dx-c*T)));
        inner += (ue[j]-up[j])*(ue[j]-up[j]);
        L2cpuSolution += ue[j]*ue[j];
        L2gpuSolution += up[j]*up[j];
    }
    L2cpuSolution = sqrt(L2cpuSolution)/n;
    L2gpuSolution = sqrt(L2gpuSolution)/n;
    L2_exact = sqrt(inner/(n));
    ERROR_PERCENTAGE = 100*(L2_exact/L2cpuSolution);
    printf("L2_exact=%lf\n",L2_exact);
    printf("gpul2=%lf, and cpuL2=%lf \n",L2gpuSolution,L2cpuSolution);
    printf("ERROR_PERCENTAGE= %lf\n", ERROR_PERCENTAGE);

    // Free device memory
    HANDLE_ERROR(cudaFree(dev_up));
    HANDLE_ERROR(cudaFree(dev_u));
    HANDLE_ERROR(cudaFree(dev_um));
    return 0;
}
答案 0 :(得分:3)
基本上,我不认为你在这里做错了什么。但是,你不应该期望CUDA会施展魔法,让程序一下子比CPU实现快很多。尤其是像一维波动方程这样相对简单的问题(在CPU实现中只是一个单层循环),对现代计算机来说计算量非常小,没有太大必要将它并行化。请记住:主机与设备之间来回传输数据可能成为GPU实现的性能瓶颈。因此,除非你的数据量很大(比如 n > 10^6 左右),否则我认为不值得这样做。
但是,改进内核代码的一种方法是预先计算一些变量。变量 `dx`、`dt`、`r` 和 `R` 在整个模拟过程中似乎保持不变,但在每个时间步中每个线程都会重新计算它们,这可能带来数以百万计的多余计算。此外,使用纹理内存存放数组数据有可能提高速度,因为大多数内存访问都发生在每个块内相邻的位置。
答案 1 :(得分:2)
上述评论中讨论的主要问题是:一维时域有限差分(FDTD)方法用 C/C++ 实现并在串行机器上运行,是否比用 CUDA 实现并在并行 GPU 上运行更快。
我试图用下面的代码回答这个问题。它包含用于C / C ++和CUDA中的电磁应用的1D FDTD方法的实现。理论和C / C ++实现取自Understanding the Finite-Difference Time-Domain Method(参见程序3.1)。 CUDA版本包含两种方法,一种仅使用全局内存,另一种使用共享内存。在后一种情况下,我通过启动两个不同的内核来强制磁场和电场更新之间的同步。
对于足够大的问题(`SIZE = 10000000`),GPU 版本确实比 CPU 版本快。我在 Kepler K20c 卡上测试了代码,结果如下:
Shared Memory version
CPU elapsed time = 3980.763 ms
GPU elapsed time = 356.828 ms
Global Memory version
GPU elapsed time = 359.768 ms
使用共享内存的版本并没有带来明显的性能提升。
以下是代码:
**kernel.cu**
/* 1D FDTD simulation with an additive source. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "TimingCPU.h"
#include "TimingGPU.cuh"
#define BLOCKSIZE 512
//#define DEBUG
/*******************/
/* iDivUp FUNCTION */
/*******************/
/* Integer quotient a / b, bumped by one whenever the division leaves a
   remainder (i.e. the ceiling of a / b for non-negative a and positive b). */
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
/* Wraps a CUDA runtime call and reports the call site if it failed. */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/*
 * Prints the CUDA error string together with the file/line of the failing call
 * and, unless abort is false, terminates the process with the error code.
 *
 *   code  - status returned by the CUDA runtime call
 *   file  - source file of the call site (receives __FILE__, a string literal,
 *           hence pointer-to-const)
 *   line  - source line of the call site
 *   abort - when true (default) exit(code) is called on failure
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/***********************************/
/* HOST-SIZE FIELD UPDATE FUNCTION */
/***********************************/
/*
 * One FDTD time step on the host, updating both fields in place.
 *
 *   h_ez   - electric field, N entries
 *   h_hy   - magnetic field, N entries
 *   imp0   - impedance used to scale between the two fields
 *   qTime  - current time-step index (drives the source pulse)
 *   source - node at which the additive source is injected
 *   N      - number of nodes
 */
void updateHost(double *h_ez, double* h_hy, double imp0, double qTime, const int source, const int N) {
    int cell = 0;

    /* magnetic field: nodes 0 .. N-2, driven by the forward difference of Ez */
    while (cell < N - 1) {
        const double dEz = h_ez[cell + 1] - h_ez[cell];
        h_hy[cell] += dEz / imp0;
        ++cell;
    }

    /* electric field: nodes 1 .. N-1, driven by the backward difference of Hy */
    for (cell = 1; cell < N; ++cell) {
        const double dHy = h_hy[cell] - h_hy[cell - 1];
        h_ez[cell] += dHy * imp0;
    }

    /* additive Gaussian-in-time source at node `source` */
    const double shifted = qTime - 30.;
    h_ez[source] += exp(-shifted * shifted / 100.);
}
/********************************************************/
/* DEVICE-SIZE FIELD UPDATE FUNCTION - NO SHARED MEMORY */
/********************************************************/
/*
 * Single-kernel FDTD step working directly on global memory: each thread
 * updates the magnetic field at its own node, then the electric field at the
 * same node, then (for the source node only) adds the source pulse.
 *
 *   d_ez   - electric field, N entries, updated in place
 *   d_hy   - magnetic field, N entries, updated in place
 *   imp0   - impedance used to scale between the two fields
 *   qTime  - current time-step index (drives the source pulse)
 *   source - node at which the additive source is injected
 *   N      - number of nodes; threads with tid >= N do nothing
 *
 * Launch layout: 1D grid of 1D blocks, one thread per node.
 *
 * NOTE(review): the electric-field update reads d_hy[tid - 1], which is written
 * by a different thread, and the magnetic-field update reads d_ez[tid + 1],
 * which another thread overwrites later in this same kernel. __threadfence()
 * is a memory fence, not a barrier, so nothing here makes a thread wait for its
 * neighbour (in particular a neighbour in another block). The result may
 * therefore depend on scheduling - verify against the host reference, or use
 * the two-kernel variant where the launches provide the ordering.
 */
__global__ void updateDevice_v0(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
/* update magnetic field (nodes 0 .. N-2) */
if (tid < N-1) d_hy[tid] = d_hy[tid] + (d_ez[tid + 1] - d_ez[tid]) / imp0;
/* fence on this thread's own writes only; it does not wait for other threads */
__threadfence();
/* update electric field (nodes 1 .. N-1) */
if ((tid < N)&&(tid > 0)) d_ez[tid] = d_ez[tid] + (d_hy[tid] - d_hy[tid - 1]) * imp0;
/* add the source pulse at node `source` */
if (tid == source) d_ez[tid] += exp(-(qTime - 30.) * (qTime - 30.) / 100.);
}
/**************************************************************/
/* DEVICE-SIZE MAGNETIC FIELD UPDATE FUNCTION - SHARED MEMORY */
/**************************************************************/
/*
 * Magnetic-field half of the FDTD step, with the electric field staged through
 * shared memory.
 *
 *   d_ez   - electric field, N entries, read only here
 *   d_hy   - magnetic field, N entries, updated in place for nodes 0 .. N-2
 *   imp0   - impedance used to scale between the two fields
 *   qTime  - unused here; kept so both half-step kernels share one signature
 *   source - unused here; kept so both half-step kernels share one signature
 *   N      - number of nodes
 *
 * Launch layout: 1D grid of 1D blocks, one thread per node. The halo indexing
 * requires blockDim.x == BLOCKSIZE.
 * Shared memory: (BLOCKSIZE + 1) doubles, statically allocated.
 */
__global__ void updateDevice_hy(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Tile of Ez for this block plus one halo cell on the right. Hy is only
    // ever touched at the thread's own node, so it needs no staging.
    __shared__ double ez_temp[BLOCKSIZE+1];

    // Guarded load: the last block usually extends past N, and reading d_ez
    // there would be out of bounds.
    if (tid < N) ez_temp[threadIdx.x] = d_ez[tid];

    // Thread 0 fetches the first Ez value of the next block, if there is one.
    if ((threadIdx.x == 0)&&((tid + BLOCKSIZE) < N)) {
        ez_temp[BLOCKSIZE] = d_ez[tid + BLOCKSIZE];
    }

    // Reached by every thread of the block; the guards above only skip loads.
    __syncthreads();

    /* update magnetic field */
    if (tid < N-1) d_hy[tid] = d_hy[tid] + (ez_temp[threadIdx.x + 1] - ez_temp[threadIdx.x]) / imp0;
}
/**************************************************************/
/* DEVICE-SIZE ELECTRIC FIELD UPDATE FUNCTION - SHARED MEMORY */
/**************************************************************/
/*
 * Electric-field half of the FDTD step, with the magnetic field staged through
 * shared memory, followed by the additive source.
 *
 *   d_ez   - electric field, N entries, updated in place for nodes 1 .. N-1
 *            (and at `source`)
 *   d_hy   - magnetic field, N entries, read only here
 *   imp0   - impedance used to scale between the two fields
 *   qTime  - current time-step index (drives the source pulse)
 *   source - node at which the additive source is injected
 *   N      - number of nodes
 *
 * Launch layout: 1D grid of 1D blocks, one thread per node. The tile is sized
 * for blockDim.x == BLOCKSIZE.
 * Shared memory: (BLOCKSIZE + 1) doubles, statically allocated.
 */
__global__ void updateDevice_ez(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Tile of Hy shifted by one: slot 0 is the halo cell on the left, slot
    // threadIdx.x + 1 is the thread's own node.
    __shared__ double hy_temp[BLOCKSIZE+1];

    // Guarded load: the last block usually extends past N.
    if (tid < N) hy_temp[threadIdx.x + 1] = d_hy[tid];

    // Thread 0 fetches the last Hy value of the previous block, if there is one.
    if ((threadIdx.x == 0)&&(tid >= 1)&&(tid - 1 < N)) {
        hy_temp[0] = d_hy[tid - 1];
    }

    // Reached by every thread of the block; threads leave only after it.
    __syncthreads();

    if (tid >= N) return;

    const bool interior = (tid > 0);
    const bool isSource = (tid == source);
    if (!interior && !isSource) return;

    // The new value is kept in a register. Writing it back into a shared tile
    // that neighbouring threads are still reading would be a race.
    double ez = d_ez[tid];

    /* update electric field */
    if (interior) ez += (hy_temp[threadIdx.x + 1] - hy_temp[threadIdx.x]) * imp0;

    /* additive source at node `source` (applied for node 0 too, as on the host) */
    if (isSource) ez += exp(-(qTime - 30.) * (qTime - 30.) / 100.);

    d_ez[tid] = ez;
}
/********/
/* MAIN */
/********/
/*
 * Benchmark driver: runs maxTime FDTD steps on the host and on the device and
 * prints the elapsed time of each. With DEBUG defined the host loop is skipped
 * and every device step is instead compared against one host step by printing
 * both fields.
 */
int main() {
    // --- Problem size
    const int SIZE = 10000000;
    // --- Free-space wave impedance
    double imp0 = 377.0;
    // --- Maximum number of iterations (must be less than the problem size)
    int maxTime = 100;
    // --- Source location
    int source = SIZE / 2;

    // --- Host side memory allocations and initializations
    double *h_ez = (double*)calloc(SIZE, sizeof(double));
    double *h_hy = (double*)calloc(SIZE, sizeof(double));
    if (h_ez == NULL || h_hy == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_ez);
        free(h_hy);
        return 1;
    }

    // --- Device side memory allocations and initializations
    double *d_ez; gpuErrchk(cudaMalloc((void**)&d_ez, SIZE * sizeof(double)));
    double *d_hy; gpuErrchk(cudaMalloc((void**)&d_hy, SIZE * sizeof(double)));
    gpuErrchk(cudaMemset(d_ez, 0, SIZE * sizeof(double)));
    gpuErrchk(cudaMemset(d_hy, 0, SIZE * sizeof(double)));

    // --- Host side memory allocations for debugging purposes
#ifdef DEBUG
    double *h_ez_temp = (double*)calloc(SIZE, sizeof(double));
    double *h_hy_temp = (double*)calloc(SIZE, sizeof(double));
    if (h_ez_temp == NULL || h_hy_temp == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
#endif

    // --- Host-side time-steppings
#ifndef DEBUG
    TimingCPU timerCPU;
    timerCPU.StartCounter();
    for (int qTime = 0; qTime < maxTime; qTime++) {
        updateHost(h_ez, h_hy, imp0, qTime, source, SIZE);
    }
    printf("CPU elapsed time = %3.3f ms\n", timerCPU.GetCounter());
#endif

    TimingGPU timerGPU;
    timerGPU.StartCounter();

    // --- Device-side time-steppings
    for (int qTime = 0; qTime < maxTime; qTime++) {
        updateDevice_v0<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
        // updateDevice_hy<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
        // updateDevice_ez<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);

        // Launch-configuration errors are only visible through the last-error
        // state, so look at it after every launch (no synchronisation implied).
        gpuErrchk(cudaPeekAtLastError());
#ifdef DEBUG
        gpuErrchk(cudaDeviceSynchronize());
        gpuErrchk(cudaMemcpy(h_ez_temp, d_ez, SIZE * sizeof(double), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_hy_temp, d_hy, SIZE * sizeof(double), cudaMemcpyDeviceToHost));
        updateHost(h_ez, h_hy, imp0, qTime, source, SIZE);
        for (int i=0; i<SIZE; i++) {
            printf("%f %f %f %f\n",h_ez_temp[i], h_ez[i], h_hy_temp[i], h_hy[i]);
        }
        printf("\n");
#endif
    }
    printf("GPU elapsed time = %3.3f ms\n", timerGPU.GetCounter());

    // --- Release device and host buffers
    gpuErrchk(cudaFree(d_ez));
    gpuErrchk(cudaFree(d_hy));
#ifdef DEBUG
    free(h_ez_temp);
    free(h_hy_temp);
#endif
    free(h_ez);
    free(h_hy);

    return 0;
}
<强> TimingCPU.h 强>
#ifndef __TIMINGCPU_H__
#define __TIMINGCPU_H__
#ifdef __linux__
/* Host-side wall-clock stopwatch (Linux build). */
class TimingCPU {
public:
    TimingCPU();            // zeroes the stored time stamp and starts the counter
    ~TimingCPU();

    void StartCounter();    // stores the current time of day as the reference
    double GetCounter();    // ms since the reference; also resets the reference

private:
    long cur_time_;         // reference time stamp in microseconds
};
#elif _WIN32 || _WIN64
// Opaque timer state; the definition lives in the implementation file.
struct PrivateTimingCPU;

/* Host-side wall-clock stopwatch (Windows build). */
class TimingCPU
{
public:
    TimingCPU();            // allocates and zeroes the private state
    ~TimingCPU();

    void StartCounter();    // queries the counter frequency and stores the start tick
    double GetCounter();    // ms elapsed since StartCounter()

private:
    PrivateTimingCPU *privateTimingCPU;
}; // TimingCPU class
#endif
#endif
<强> TimingCPU.cpp 强>
/**************/
/* TIMING CPU */
/**************/
#include "TimingCPU.h"
#ifdef __linux__
#include <sys/time.h>
#include <stdio.h>
// Zero the reference stamp, then immediately take a first reading.
TimingCPU::TimingCPU()
{
    cur_time_ = 0;
    StartCounter();
}

// Nothing to release.
TimingCPU::~TimingCPU()
{
}
// Stores the current time of day, in microseconds, as the reference stamp.
// The stamp is left untouched if gettimeofday() reports a failure.
void TimingCPU::StartCounter()
{
    struct timeval now;
    const int rc = gettimeofday(&now, 0);
    if (rc == 0) {
        cur_time_ = 1000000 * now.tv_sec + now.tv_usec;
    }
}
double TimingCPU::GetCounter()
{
struct timeval time;
if(gettimeofday( &time, 0 )) return -1;
long cur_time = 1000000 * time.tv_sec + time.tv_usec;
double sec = (cur_time - cur_time_) / 1000000.0;
if(sec < 0) sec += 86400;
cur_time_ = cur_time;
return 1000.*sec;
}
#elif _WIN32 || _WIN64
#include <windows.h>
#include <iostream>
// Timer state hidden behind the TimingCPU pointer member (Windows build).
struct PrivateTimingCPU {
double PCFreq;          // performance-counter ticks per millisecond (frequency / 1000)
__int64 CounterStart;   // performance-counter reading taken by StartCounter()
};
// --- Default constructor: allocates the private state and zeroes it
TimingCPU::TimingCPU()
{
    privateTimingCPU = new PrivateTimingCPU;
    privateTimingCPU->PCFreq = 0.0;
    privateTimingCPU->CounterStart = 0;
}
// --- Default destructor: releases the state allocated by the constructor.
// NOTE(review): the class declares no copy constructor/assignment, so a copied
// timer would share this pointer - confirm no caller copies TimingCPU objects.
TimingCPU::~TimingCPU()
{
    delete privateTimingCPU;
    privateTimingCPU = 0;
}
// --- Starts the timing
void TimingCPU::StartCounter()
{
LARGE_INTEGER li;
if(!QueryPerformanceFrequency(&li)) std::cout << "QueryPerformanceFrequency failed!\n";
(*privateTimingCPU).PCFreq = double(li.QuadPart)/1000.0;
QueryPerformanceCounter(&li);
(*privateTimingCPU).CounterStart = li.QuadPart;
}
// --- Gets the timing counter in ms
double TimingCPU::GetCounter()
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
return double(li.QuadPart-(*privateTimingCPU).CounterStart)/(*privateTimingCPU).PCFreq;
}
#endif
<强> TimingGPU.cuh 强>
#ifndef __TIMING_CUH__
#define __TIMING_CUH__
/**************/
/* TIMING GPU */
/**************/
// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
// microsecond precision.
// Opaque timer state (the CUDA events); the definition lives in the
// implementation file.
struct PrivateTimingGPU;

/* Device-side stopwatch built on CUDA events. */
class TimingGPU
{
public:
    TimingGPU();                // allocates the private state
    ~TimingGPU();

    void StartCounter();        // creates the events and records the start event
    void StartCounterFlags();   // same, with events created as cudaEventBlockingSync
    float GetCounter();         // records/synchronizes the stop event; returns elapsed ms

private:
    PrivateTimingGPU *privateTimingGPU;
}; // TimingGPU class
#endif
<强> TimingGPU.cu 强>
/**************/
/* TIMING GPU */
/**************/
#include "TimingGPU.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
// Timer state hidden behind the TimingGPU pointer member.
struct PrivateTimingGPU {
cudaEvent_t start;  // recorded by StartCounter() / StartCounterFlags()
cudaEvent_t stop;   // recorded by GetCounter()
};
// Destroys whichever events currently exist and marks both slots as empty, so
// the timer can be restarted or torn down without leaking events.
static void releaseTimingEvents(PrivateTimingGPU *state)
{
    if (state->start) cudaEventDestroy(state->start);
    if (state->stop)  cudaEventDestroy(state->stop);
    state->start = 0;
    state->stop  = 0;
}

// default constructor: allocates the private state with no events created yet
TimingGPU::TimingGPU()
{
    privateTimingGPU = new PrivateTimingGPU;
    privateTimingGPU->start = 0;
    privateTimingGPU->stop  = 0;
}

// default destructor: destroys any live events and frees the private state.
// NOTE(review): the class declares no copy constructor/assignment, so a copied
// timer would share this pointer - confirm no caller copies TimingGPU objects.
TimingGPU::~TimingGPU()
{
    releaseTimingEvents(privateTimingGPU);
    delete privateTimingGPU;
    privateTimingGPU = 0;
}

// Creates a fresh start/stop event pair and records the start event on
// stream 0. Events left over from a previous start are destroyed first.
void TimingGPU::StartCounter()
{
    releaseTimingEvents(privateTimingGPU);
    cudaEventCreate(&(privateTimingGPU->start));
    cudaEventCreate(&(privateTimingGPU->stop));
    cudaEventRecord(privateTimingGPU->start, 0);
}

// Same as StartCounter(), but the events are created with the
// cudaEventBlockingSync flag.
void TimingGPU::StartCounterFlags()
{
    int eventflags = cudaEventBlockingSync;

    releaseTimingEvents(privateTimingGPU);
    cudaEventCreateWithFlags(&(privateTimingGPU->start), eventflags);
    cudaEventCreateWithFlags(&(privateTimingGPU->stop), eventflags);
    cudaEventRecord(privateTimingGPU->start, 0);
}

// Gets the counter in ms: records the stop event on stream 0, waits for it and
// returns the time between start and stop. Returns 0 if the timer was never
// started (or if the elapsed-time query does not fill in a value).
float TimingGPU::GetCounter()
{
    float time = 0.0f;

    if (!privateTimingGPU->start || !privateTimingGPU->stop) {
        return time;
    }

    cudaEventRecord(privateTimingGPU->stop, 0);
    cudaEventSynchronize(privateTimingGPU->stop);
    cudaEventElapsedTime(&time, privateTimingGPU->start, privateTimingGPU->stop);
    return time;
}