CPU与GPU内存性能

时间:2019-06-29 10:49:06

标签: performance cuda microbenchmark memory-access

我想了解CUDA如何管理GPU RAM访问。据我所知,CPU RAM芯片和GPU RAM芯片并没有太大的区别,因此在合理的条件下,我希望它们具有相似的性能。

我写了一段小代码来对数组的元素求和。内存访问可以是顺序的,也可以是跨步(稀疏)的;数组可以存放在CPU RAM或GPU RAM中。求和分别由CPU和GPU中的单个线程执行。

将第7行(`#define GPU true`)注释掉,即可把代码编译为纯CPU版本。

#include <iostream>
#include <cassert>
#include <ctime>

using namespace std;

#define GPU true

#ifndef GPU
#define __global__
#define __host__
#define __device__
#endif

// Allocate `count` elements of type T through `ptr`.
// GPU build: CUDA unified (managed) memory, accessible from host and device.
// CPU build: plain malloc.
// Aborts via assert on allocation failure.
template<typename T>
__host__ 
void memAlloc(T** ptr, int count)
{
#ifdef GPU
    // Check the CUDA status explicitly: on failure cudaMallocManaged leaves
    // *ptr unspecified, so the nullptr assert below is not a reliable check
    // on its own. size_t cast avoids int overflow for large counts.
    cudaError_t err = cudaMallocManaged(ptr, sizeof(T) * static_cast<size_t>(count));
    assert(err == cudaSuccess);
#else
    *ptr = static_cast<T*>(malloc(sizeof(T) * static_cast<size_t>(count)));
#endif
    assert(*ptr != nullptr);
}

// Return the number of timer ticks per millisecond for the timestamp
// source used by getTicks().
__host__
int64_t getClockSpeed()
{
    int64_t ticksPerSecond;
#ifdef GPU
    // getTicks() on the GPU reads %globaltimer, a wall-clock NANOSECOND
    // timer that is independent of the SM core clock. The conversion
    // factor is therefore a fixed 1e9 ticks/second — using
    // cudaDevAttrClockRate here (as the original did) scales the elapsed
    // time by the wrong constant and badly distorts the GPU timings.
    ticksPerSecond = 1000000000LL;
#else
    // clock() ticks at CLOCKS_PER_SEC on the host.
    ticksPerSecond = static_cast<int64_t>(CLOCKS_PER_SEC);
#endif    
    return ticksPerSecond / 1000;
}

// Read the GPU's %globaltimer special register via inline PTX.
// Per the PTX ISA, %globaltimer is a 64-bit wall-clock timer in
// nanoseconds — it does NOT tick at the SM core clock rate, so any
// conversion to seconds must use a fixed 1e9 factor, not the device
// clock rate.
static __device__ inline uint64_t gpuClock()
{
    uint64_t globaltime;
    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(globaltime) );
    return globaltime;
}

// Return a raw timestamp: the GPU global (nanosecond) timer when built
// for the GPU, otherwise the host CPU's clock(). Units differ between
// the two builds; pair with getClockSpeed() to convert to milliseconds.
__device__
clock_t getTicks()
{
#ifdef GPU
    return static_cast<clock_t>(gpuClock());
#else
    return clock();
#endif
}

// Sum all elements of `mem` using a strided ("sparse") access pattern:
// for each offset i in [0, stepSize) the array is walked with stride
// stepSize. Each round recomputes the same total; the extra rounds only
// repeat the memory traffic for benchmarking, and the last round's sum
// is returned.
__host__ __device__
int sparseRead(int32_t* mem, int elementCount, int stepSize, int rounds)
{
    // Initialize to 0 so the result is well-defined when rounds == 0;
    // the original returned an uninitialized value in that case (UB).
    int sum = 0;

    for(int r = 0; r < rounds; r += 1)
    {
        sum = 0;
        for(int i = 0; i < stepSize; i += 1)
        {
            for(int j = i; j < elementCount; j += stepSize)
            {
                sum += mem[j];
            }
        }
    }

    return sum;
}

// Sum all elements of `mem` with a simple sequential walk. Each round
// recomputes the same total (the repetition exists only to generate
// repeated memory traffic for benchmarking); the last round's sum is
// returned.
__host__ __device__
int sequentialRead(int32_t* mem, int elementCount, int rounds)
{
    // Initialize to 0 so the result is well-defined when rounds == 0;
    // the original returned an uninitialized value in that case (UB).
    int sum = 0;

    for(int r = 0; r < rounds; r += 1)
    {
        sum = 0;
        for(int j = 0; j < elementCount; j += 1)
        {
            sum += mem[j];
        }
    }

    return sum;
}

// Benchmark kernel, intended to be launched with <<<1,1>>> (a single
// thread): fills `mem` with small values, times sparseRead over it, and
// writes the elapsed ticks and the computed sum through the out-pointers.
__global__
void getElapsedTicks(int32_t* mem, int elementCount, int stepSize, int rounds, clock_t* elapsedTicks, int32_t* sum)
{
    // Guard the modulus: `i % rounds` with rounds == 0 is a division by
    // zero (UB / device trap in the original).
    int modulus = (rounds > 0) ? rounds : 1;
    for(int i = 0; i < elementCount; i += 1)
    {
        mem[i] = i % modulus;
    }

    clock_t start = getTicks();
    // clock_t is 64-bit on common platforms; printing it with %d (as the
    // original did) is undefined behavior and prints truncated garbage.
    printf("Start %lld\n", static_cast<long long>(start));

    *sum = sparseRead(mem, elementCount, stepSize, rounds);
    //*sum = sequentialRead(mem, elementCount, rounds);

    clock_t end = getTicks();
    printf("End %lld\n", static_cast<long long>(end));

    *elapsedTicks = end - start;
}

// Entry point: parses <elementCount> <stepSize> <rounds> from the command
// line, runs the timing kernel (GPU build) or the plain function (CPU
// build) and prints the sum and elapsed time in milliseconds.
int main(int argc, char *argv[]) 
{
    // Validate the argument count before indexing argv — the original
    // segfaulted when run without all three arguments.
    if (argc < 4)
    {
        cerr << "Usage: " << argv[0] << " <elementCount> <stepSize> <rounds>" << endl;
        return 1;
    }

    int elementCount = atoi(argv[1]);
    int stepSize = atoi(argv[2]);
    int rounds = atoi(argv[3]);
    if (elementCount <= 0 || stepSize <= 0 || rounds <= 0)
    {
        cerr << "All three arguments must be positive integers" << endl;
        return 1;
    }

    int32_t* mem;
    memAlloc(&mem, elementCount);

    clock_t* elapsedTicks;
    memAlloc(&elapsedTicks, 1);

    int32_t* sum;
    memAlloc(&sum, 1);

#ifdef GPU
    getElapsedTicks<<<1,1>>>(mem, elementCount, stepSize, rounds, elapsedTicks, sum); 
    // Synchronize AND check the status: in-kernel faults only surface here.
    cudaError_t err = cudaDeviceSynchronize();
    assert(err == cudaSuccess);
#else
    getElapsedTicks(mem, elementCount, stepSize, rounds, elapsedTicks, sum); 
#endif

    uint64_t elapsedTime = *elapsedTicks / getClockSpeed();
    cout << "Sum "<< *sum << endl;
    cout << "Elapsed " << elapsedTime << " ms" << endl;

    // Release allocations with the API matching memAlloc's branch.
#ifdef GPU
    cudaFree(mem);
    cudaFree(elapsedTicks);
    cudaFree(sum);
#else
    free(mem);
    free(elapsedTicks);
    free(sum);
#endif
    return 0;
} 

由于GPU的时钟频率较低,我预计GPU版本大约慢3倍,但实际上无论是顺序求和还是稀疏求和都慢了约100倍。我遗漏了什么?

0 个答案:

没有答案