CUDA结构对齐正在减慢我的代码(可编译示例)

时间:2015-08-26 17:58:30

标签: cuda memory-alignment

我有一个模拟计算在电场和磁场中移动的带电粒子的3D矢量。 我尝试使用__align__说明符在CUDA中加快速度,认为可能限制因素是全局内存读写,但最终使用__align__放慢速度(可能是因为它增加了总内存需求)。我还尝试使用float3float4,但他们的表现相似

我已经创建了此代码的简化版本并将其粘贴在下方以显示我的问题。 下面的代码应该是可编辑的,并且通过将第四行CASE的定义更改为012,我在上面描述了不同的选项可以尝试。定义了两个函数ParticleMoverCPUParticleMoverGPU来比较CPU与GPU的性能。

  1. 有没有理由说我的内存合并尝试正在减慢而不是加速我的代码?
  2. 还有什么其他的东西显而易见,我没有做到这一点,我可以做到比60倍的加速更好的"令人尴尬的平行"像这样的代码?
  3. 谢谢!

    CPU - Intel Xeon E5620 @ 2.40GHz

    GPU - NVIDIA Tesla C2070

    // CASE 0: Regular struct with 3 floats
    // CASE 1: Aligned struct using __align__(16) with 3 floats
    // CASE 2: float3
    #define CASE        0   // define to either 0, 1 or 2 as described above
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <Windows.h>
    
    #include <stdio.h>
    #include <math.h>
    #include <time.h>
    #include <malloc.h>
    #include <sys/stat.h>
    
    #define CEX         10  // x-value of electric field (dimensionless and arbitrary)
    #define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
    #define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
    #define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
    #define CBY         0.1 // x-value of magnetic field (dimensionless and arbitrary)
    #define CBZ         10  // x-value of magnetic field (dimensionless and arbitrary)
    
    #define FACTOR      15  // I played around with these numbers until I got the best speedup
    #define THREADS     256 // I played around with these numbers until I got the best speedup
    
    typedef struct{
        float x;
        float y;
        float z;
    } VecCPU;           //Struct for vectors for CPU calculation
    
    // Fastest method seems to be a regular unaligned struct with 3 floats
    #if CASE==0
    typedef struct {
        float x;
        float y;
        float z;
    } VecGPU;
    #endif
    
    #if CASE==1
    // This method seems to be less fast.  It is an attempt to align for memory coalescence
    typedef struct __align__(16){
        float x;
        float y;
        float z;
    } VecGPU;
    #endif
    
    // Using float3 seems to be about the same as defining our own vector3 structure
    #if CASE==2
    typedef float3 VecGPU;
    #endif
    
    VecCPU *pos_c, *vel_c;                  // global position and velocity vectors for CPU calculation
    __constant__ VecGPU *pos_d, *vel_d;     // pointers in constant memory which we will point to data in global memory
    
    void ParticleMoverCPU(int np, int ts, float dt){
    
        int n = 0;
        while (n < np){
    
            VecCPU vminus, tvec, vprime, vplus;
            float tvec_fact;
            int it = 0;
            while (it < ts){
                // ----- Update velocities by the Boris method ------ //
                vminus.x = vel_c[n].x + CEX*0.5*dt;
                vminus.y = vel_c[n].y + CEY*0.5*dt;
                vminus.z = vel_c[n].z + CEZ*0.5*dt;
                tvec.x = CBX*0.5*dt;
                tvec.y = CBY*0.5*dt;
                tvec.z = CBZ*0.5*dt;
                tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
                vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
                vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
                vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
                vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
                vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
                vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
                vel_c[n].x = vplus.x + CEX*0.5*dt;
                vel_c[n].y = vplus.y + CEY*0.5*dt;
                vel_c[n].z = vplus.z + CEZ*0.5*dt;
    
                // ------ Update Particle positions -------------- //
                pos_c[n].x += vel_c[n].x*dt;
                pos_c[n].y += vel_c[n].y*dt;
                pos_c[n].z += vel_c[n].z*dt;
                it++;
            }
            n++;
        }
    }
    
    __global__ void ParticleMoverGPU(register int np,register int ts, register float dt){
    
        register int n = threadIdx.x + blockDim.x * blockIdx.x;
        while (n < np){
    
            register VecGPU vminus, tvec, vprime, vplus;// , vtemp;
            register float tvec_fact;
            register int it = 0;
            while (it < ts){
                // ----- Update velocities by the Boris method ------ //
                vminus.x = vel_d[n].x + CEX*0.5*dt;
                vminus.y = vel_d[n].y + CEY*0.5*dt;
                vminus.z = vel_d[n].z + CEZ*0.5*dt;
                tvec.x = CBX*0.5*dt;
                tvec.y = CBY*0.5*dt;
                tvec.z = CBZ*0.5*dt;
                tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
                vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
                vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
                vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
                vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
                vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
                vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
                vel_d[n].x = vplus.x + CEX*0.5*dt;
                vel_d[n].y = vplus.y + CEY*0.5*dt;
                vel_d[n].z = vplus.z + CEZ*0.5*dt;
                // ------ Update Particle positions -------------- //
                pos_d[n].x += vel_d[n].x*dt;
                pos_d[n].y += vel_d[n].y*dt;
                pos_d[n].z += vel_d[n].z*dt;
                it++;
            }
            n += blockDim.x*gridDim.x;
        }
    }
    
    int main(void){
    
        int np = 50000;                                         // Number of Particles
        const int ts = 1000;                                    // Number of Time-steps
        const float dt = 1E-3;                                  // Time-step value
    
    
        // ----------- CPU ----------- //
    
        pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
        vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity
    
        for (int n = 0; n < np; n++){
            pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
            vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
        }
    
        printf("Starting CPU kernel\n");
        clock_t startCPU;
        float CPUtime;
        startCPU = clock();
        ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
        CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
        printf("CPU kernel finished\n");
        // Ouput final CPU computation time
        printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
    
        // ------------ GPU ----------- //
    
        cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1);    //Set memory preference to L1 (doesn't have much effect)
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, 0);
        int blocks = deviceProp.multiProcessorCount;
    
        VecGPU *pos_g, *vel_g, *pos_l, *vel_l;
    
        pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for positions on the CPU
        vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for velocities on the CPU
    
        cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np);      // allocate memory for positions on the GPU
        cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np);      // allocate memory for velocities on the GPU
    
        cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*));   // copy memory address of position to the constant memory pointer pos_d
        cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*));   // copy memory address of velocity to the constant memory pointer vel_d
    
        for (int n = 0; n < np; n++){
            pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
            vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
        }
    
        cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
        cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
    
        printf("Starting GPU kernel\n");
        // start cuda timer
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
    
        ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(np, ts, dt);             // Launch GPU kernel
    
        //stop cuda timer
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        printf("GPU kernel finished\n");
    
        cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy positions from GPU memory back to CPU
        cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy velocities from GPU memory back to CPU
    
        // Ouput GPU computation time
        printf("GPUtime = %6.1f ms\n", elapsedTime);
    
        // Output speedup factor
        printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);
    
        // free allocated memory
        cudaFree(pos_l);
        cudaFree(vel_l);
        free(pos_g);
        free(vel_g);
        free(pos_c);
        free(vel_c);
    }
    

    对于CASE 0(常规向量结构),我得到:

    CPUtime = 1302.0 ms
    GPUtime =   21.8 ms
    Speedup = 59.79
    

    对于CASE 1__align__(16)向量结构),我得到:

    CPUtime = 1298.0 ms
    GPUtime =   24.5 ms
    Speedup = 53.08
    

    对于CASE 2(使用float3),我得到:

    CPUtime = 1305.0 ms
    GPUtime =   21.8 ms
    Speedup = 59.80
    

    如果我使用float4代替float3,我会得到与__align__(16)方法类似的内容。

    谢谢!

0 个答案:

没有答案