我的 3D Laplacian 解算器有效。我获得了350 Gflop / s的功率,我正在尝试升级它以获得更好的性能和两倍的块。 但是,表演仍然是350 Gflop / s:
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include"res3dcb.cuh"
#include <math.h>
using namespace std;
// Constant statement.
const int blocksize=32;
const int N=128;
const int size=(N+2)*(N+2)*(N+2)*sizeof(float);
// Let's start the main program.
int main(void) {
// Variable statement.
float time1,time2,time3;
float *x_d, *y_d;
float *x,*y;
float gflops;
float NumOps;
int power=4; // You can change power as you prefer (but keep 2^x)
// Init x and y.
x = new float[size];
y = new float[size];
for (int k=1;k<N+1;k++)
for (int i=1;i<N+1;i++)
for (int j=1;j<N+1;j++) {
x[k*(N+2)*(N+2)+i*(N+2)+j]=cos(i+j+k);
}
// Shadow cases.
for (int k=1;k<N+1;k++) {
for (int i=1;i<N+1;i++) {
x[k*(N+2)*(N+2)+i*(N+2)]=x[k*(N+2)*(N+2)+i*(N+2)+1];
x[k*(N+2)*(N+2)+i*(N+2)+N+1]=x[k*(N+2)*(N+2)+i*(N+2)+N];}
for (int j=0;j<N+2;j++) {
x[k*(N+2)*(N+2)+j]=x[k*(N+2)*(N+2)+(N+2)+j];
x[k*(N+2)*(N+2)+(N+1)*(N+2)+j]=x[k*(N+2)*(N+2)+N*(N+2)+j];}
for (int i=0;i<N+2;i++)
for (int j=0;j<N+2;j++) {
x[(N+2)*i+j]=x[(N+2)*(N+2)+(N+2)*i+j];
x[(N+1)*(N+2)*(N+2)+(N+2)*i+j]=x[(N+2)*(N+2)*N+(N+2)*i+j];
}
// Display of initial matrix.
int id_stage=-2;
while (id_stage!=-1) {
cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;
if (id_stage != -1) {
cout<<"Etage "<<id_stage<<" du cube :"<<endl;
for (int i=0;i<N+2;i++) {
cout<<"| ";
for (int j=0;j<N+2;j++) {cout<<x[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
cout<<"|"<<endl;
}
cout<<endl;
}
}
// CPU to GPU.
cudaMalloc( (void**) & x_d, size);
cudaMalloc( (void**) & y_d, size);
cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;
// Solver parameters.
dim3 dimGrid(power*N/blocksize, power*N/blocksize);
dim3 dimBlock(blocksize, blocksize);
// Solver loop.
time1=clock();
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;
// Power calculation.
NumOps=(1.0e-9)*N*N*N*7;
gflops = ( NumOps / (time3));
// GPU to CPU.
cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
cudaFree(x_d);
cudaFree(y_d);
// Display of final matrix.
id_stage=-2;
while (id_stage!=-1) {
cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;
if (id_stage != -1) {
cout<<"Etage "<<id_stage<<" du cube :"<<endl;
for (int i=0;i<N+2;i++) {
cout<<"| ";
for (int j=0;j<N+2;j++) {cout<<y[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
cout<<"|"<<endl;
}
cout<<endl;
}
}
cout<<"Time : "<<time3<<endl;
cout<<"Gflops/s : "<<gflops<<endl;
}
其中:
__ global__ void res2d(volatile float* x, float* y, int N, int power)
{
int i = threadIdx.x + blockIdx.x*(blockDim.x);
int j = threadIdx.y + blockIdx.y*(blockDim.y);
int id,jd;
#pragma unroll //Now let's recude the number of operation per block
for (int incr=1; incr<power+1; incr++) {
if (i>(incr-1)*N && i<incr*N && j>(incr-1)*N && j<incr*N) {
#pragma unroll
for (int k=(incr-1)*(N/power) ; k<incr*N/power ; k++) {
id=i-(incr-1)*N;
jd=j-(incr-1)*N;
y[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1] = x[(N+2)*(N+2)*(k+1)+(N+2)*(id+2)+jd+1]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*id+jd+1]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+2]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd]
+ x[(N+2)*(N+2)*(k+2)+(N+2)*(id+1)+jd+1]
+ x[(N+2)*(N+2)*k+(N+2)*(id+1)+jd+1]
- 6*x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1];
}
}
}
}
带参数:
dimGrid(power * N/blocksize, power * N/blocksize) & dimBlock(blocksize, blocksize)
如果power
= 2
,4
或8
,则每个块的操作数除以2
,4
或8
。但它并不快。为什么呢?
减少每个块的操作次数是没用的吗?
提前感谢您的帮助。
答案 0 :(得分:1)
CUDA内核启动是异步的。当你这样做时:
// Solver loop.
time1=clock();
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;
计时器仅捕获API启动延迟,而不是代码的实际执行时间。这就是为什么改变内核中完成的工作量显然对性能没有影响 - 你的计时方法是不正确的。
做这样的事情:
// Solver loop.
time1=clock();
res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
cudaDeviceSynchronize();
time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;
这会插入一个阻塞调用,这将确保在测量时间之前内核已完成执行。
[此答案作为社区wiki条目添加,以便将问题从未答复的队列中删除]。