我为压力梯度计算实现了一个内核。根据我使用的算法和之前的部分,ngb_list[]
是随机的,我没有合并的内存访问。然而,内核的双精度FLOP效率是TESLA K40峰值性能的0.2%。看起来很低......!
此外:
全球内存负载效率:45.05%
全球内存存储效率:100.00%
有没有办法提高DP FLOP效率和全局内存负载效率?
您可以在这里找到代码:
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <time.h>
#include <cmath>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
typedef double Float;
__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
Float* interactionVectors_x, Float* interactionVectors_y, Float* interactionVectors_z,
int* ngb_offset, int* ngb_size, int* ngb_list,
Float* pressureGrad_x, Float* pressureGrad_y, Float* pressureGrad_z,
int num){
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num){
for (int j = ngb_offset[idx]; j < (ngb_offset[idx] + ngb_size[idx]); j++){
Float pij = (pressure[idx] + pressure[ngb_list[j]]);
pressureGrad_x[idx] += interactionVectors_x[j] * pij;
pressureGrad_y[idx] += interactionVectors_y[j] * pij;
pressureGrad_z[idx] += interactionVectors_z[j] * pij;
}
pressureGrad_x[idx] *= 0.5;
pressureGrad_y[idx] *= 0.5;
pressureGrad_z[idx] *= 0.5;
}
}
int main(){
const int num = 1 << 20;
const int tb = 1024;
int bg = (num + tb - 1) / tb;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//ngb_size
thrust::device_vector<int> ngb_size(num,27);
//ngb_offset
thrust::device_vector<int> ngb_offset(num);
thrust::exclusive_scan(ngb_size.begin(),ngb_size.end(), ngb_offset.begin());
//ngh list
int ngbSize = thrust::reduce(ngb_size.begin(),ngb_size.end());
std::cout << "ngbSize" << ngbSize << std::endl;
thrust::device_vector<int> ngb_list(ngbSize);
srand((unsigned)time(NULL));
for (int i = 0; i < num; i++){
int R = (rand()%(num - 0)) + 0;
ngb_list[i] = R;
}
//pressure
thrust::device_vector<Float> d_pressure(num);
thrust::sequence(d_pressure.begin(),d_pressure.end(),1);
//interaction vectors
thrust::device_vector<Float> d_xInteractionVectors(ngbSize,1);
thrust::device_vector<Float> d_yInteractionVectors(ngbSize,0);
thrust::device_vector<Float> d_zInteractionVectors(ngbSize,0);
//pressure gradients
thrust::device_vector<Float> pGradx(num);
thrust::device_vector<Float> pGrady(num);
thrust::device_vector<Float> pGradz(num);
//Pressure list
thrust::device_vector<Float> pressure_list(ngbSize,0);
cudaEventRecord(start);
pGrad_calculator<<<bg,tb>>>(thrust::raw_pointer_cast(&d_pressure[0]),
thrust::raw_pointer_cast(&pressure_list[0]),
thrust::raw_pointer_cast(&d_xInteractionVectors[0]),
thrust::raw_pointer_cast(&d_yInteractionVectors[0]),
thrust::raw_pointer_cast(&d_zInteractionVectors[0]),
thrust::raw_pointer_cast(&ngb_offset[0]),
thrust::raw_pointer_cast(&ngb_size[0]),
thrust::raw_pointer_cast(&ngb_list[0]),
thrust::raw_pointer_cast(&pGradx[0]),
thrust::raw_pointer_cast(&pGrady[0]),
thrust::raw_pointer_cast(&pGradz[0]),
num);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "KERNEL TIME = " << milliseconds << " milliseconds" << std::endl;
return 0;
}
答案 0 :(得分:1)
可能编译器无法正确优化您的代码,并且会使用许多额外的加载和写入。
尝试编写如下代码:
__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
Float* interactionVectors_x,
Float* interactionVectors_y,
Float* interactionVectors_z,
int* ngb_offset, int* ngb_size,
int* ngb_list, Float* pressureGrad_x,
Float* pressureGrad_y, Float* pressureGrad_z,
int num)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num){
Float x=pressureGrad_x[idx];
Float y=pressureGrad_y[idx];
Float z=pressureGrad_z[idx];
Float pressure_local=pressure[idx];
int offset=ngb_offset[idx];
int end=offset+ngb_size[idx];
for (int j = offset; j < end; j++){
Float pij = (pressure_local + pressure[ngb_list[j]]);
x += interactionVectors_x[j] * pij;
y += interactionVectors_y[j] * pij;
z += interactionVectors_z[j] * pij;
}
pressureGrad_x[idx] = 0.5*x;
pressureGrad_y[idx] = 0.5*y;
pressureGrad_z[idx] = 0.5*z;
}
}
但即使采用这种优化方式,您也很可能无法接近峰值触发器,因为您将受到内存带宽的限制。每个double / 8字节执行少于两个浮点运算。这使您在峰值触发器上限为288 GB / s * 2 FOps / 8字节= 72 GFlop / s或约5%峰值。