我内核的FLOP效率非常低

时间:2016-01-26 11:51:27

标签: performance cuda gpu gpgpu

我为压力梯度计算实现了一个内核。根据我使用的算法和之前的部分,ngb_list[]是随机的,我没有合并的内存访问。然而,内核的双精度FLOP效率是TESLA K40峰值性能的0.2%。看起来很低......!

此外:

全球内存负载效率:45.05%
全球内存存储效率:100.00%

有没有办法提高DP FLOP效率和全局内存负载效率?

您可以在这里找到代码:

#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <time.h>
#include <cmath>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>

    typedef double  Float;

__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
                                 Float* interactionVectors_x, Float* interactionVectors_y, Float* interactionVectors_z,
                                 int* ngb_offset, int* ngb_size, int* ngb_list,
                                 Float* pressureGrad_x, Float* pressureGrad_y, Float* pressureGrad_z,
                                 int num){

    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < num){

        for (int j = ngb_offset[idx]; j < (ngb_offset[idx] + ngb_size[idx]); j++){
            Float pij = (pressure[idx] + pressure[ngb_list[j]]);
            pressureGrad_x[idx] += interactionVectors_x[j] * pij; 
            pressureGrad_y[idx] += interactionVectors_y[j] * pij;
            pressureGrad_z[idx] += interactionVectors_z[j] * pij;
        }
        pressureGrad_x[idx] *= 0.5;
        pressureGrad_y[idx] *= 0.5;
        pressureGrad_z[idx] *= 0.5;
    }
}

int main(){

    const int num = 1 << 20;
    const int tb = 1024;
    int bg = (num + tb - 1) / tb;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    //ngb_size
    thrust::device_vector<int> ngb_size(num,27);

    //ngb_offset
    thrust::device_vector<int> ngb_offset(num);
    thrust::exclusive_scan(ngb_size.begin(),ngb_size.end(), ngb_offset.begin());

    //ngh list
    int ngbSize = thrust::reduce(ngb_size.begin(),ngb_size.end());

    std::cout << "ngbSize" << ngbSize << std::endl;

    thrust::device_vector<int> ngb_list(ngbSize);

    srand((unsigned)time(NULL));
    for (int i = 0; i < num; i++){

        int R = (rand()%(num - 0)) + 0;

        ngb_list[i] = R;
    }

    //pressure
    thrust::device_vector<Float> d_pressure(num);
    thrust::sequence(d_pressure.begin(),d_pressure.end(),1);

    //interaction vectors
    thrust::device_vector<Float> d_xInteractionVectors(ngbSize,1);
    thrust::device_vector<Float> d_yInteractionVectors(ngbSize,0);
    thrust::device_vector<Float> d_zInteractionVectors(ngbSize,0);

    //pressure gradients
    thrust::device_vector<Float> pGradx(num);
    thrust::device_vector<Float> pGrady(num);
    thrust::device_vector<Float> pGradz(num);

    //Pressure list
    thrust::device_vector<Float> pressure_list(ngbSize,0);

    cudaEventRecord(start);
    pGrad_calculator<<<bg,tb>>>(thrust::raw_pointer_cast(&d_pressure[0]),
                                thrust::raw_pointer_cast(&pressure_list[0]),
                                thrust::raw_pointer_cast(&d_xInteractionVectors[0]),
                                thrust::raw_pointer_cast(&d_yInteractionVectors[0]),
                                thrust::raw_pointer_cast(&d_zInteractionVectors[0]),
                                thrust::raw_pointer_cast(&ngb_offset[0]),
                                thrust::raw_pointer_cast(&ngb_size[0]),
                                thrust::raw_pointer_cast(&ngb_list[0]),
                                thrust::raw_pointer_cast(&pGradx[0]),
                                thrust::raw_pointer_cast(&pGrady[0]),
                                thrust::raw_pointer_cast(&pGradz[0]),
                                num);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    std::cout << "KERNEL TIME =  " << milliseconds << "  milliseconds" << std::endl;

    return 0;
}

1 个答案:

答案 0 :(得分:1)

可能编译器无法正确优化您的代码,并且会使用许多额外的加载和写入。

尝试编写如下代码:

__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
                                 Float* interactionVectors_x, 
                                 Float* interactionVectors_y, 
                                 Float* interactionVectors_z,
                                 int* ngb_offset, int* ngb_size, 
                                 int* ngb_list, Float* pressureGrad_x, 
                                 Float* pressureGrad_y, Float* pressureGrad_z,
                                 int num)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num){
        Float x=pressureGrad_x[idx];
        Float y=pressureGrad_y[idx];
        Float z=pressureGrad_z[idx];
        Float pressure_local=pressure[idx];
        int offset=ngb_offset[idx];
        int end=offset+ngb_size[idx];
        for (int j = offset; j < end; j++){
            Float pij = (pressure_local + pressure[ngb_list[j]]);
            x += interactionVectors_x[j] * pij; 
            y += interactionVectors_y[j] * pij;
            z += interactionVectors_z[j] * pij;
        }
        pressureGrad_x[idx] = 0.5*x;
        pressureGrad_y[idx] = 0.5*y;
        pressureGrad_z[idx] = 0.5*z;
    }
}

但即使采用这种优化方式,您也很可能无法接近峰值触发器,因为您将受到内存带宽的限制。每个double / 8字节执行少于两个浮点运算。这使您在峰值触发器上限为288 GB / s * 2 FOps / 8字节= 72 GFlop / s或约5%峰值。