I would like to understand how CUDA manages GPU RAM access. As far as I know, CPU RAM chips and GPU RAM chips are not fundamentally different, so under reasonable conditions I would expect them to show similar performance.
I wrote a small program that sums the elements of an array. The memory accesses can be either sequential or sparse (strided), and the array can live in CPU RAM or in GPU RAM. The sum is computed by a single thread, either on the CPU or on the GPU.
The code can be compiled for the CPU by commenting out line 7 (the #define GPU true line).
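For reference, something like the following should build and run it (the file name main.cu is just a placeholder; the three arguments are elementCount, stepSize and rounds):
nvcc -O2 main.cu -o main            # GPU build
g++ -O2 -x c++ main.cu -o main      # CPU build, with the #define GPU line commented out
./main <elementCount> <stepSize> <rounds>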
#include <iostream>
#include <cstdio>   // printf
#include <cstdlib>  // atoi, malloc
#include <cassert>
#include <ctime>
using namespace std;
#define GPU true
#ifndef GPU
#define __global__
#define __host__
#define __device__
#endif
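// Allocate count elements, in CUDA managed memory for the GPU build or with malloc for the CPU build.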
template<typename T>
__host__
void memAlloc(T** ptr, int count)
{
#ifdef GPU
cudaMallocManaged(ptr, sizeof(T) * count);
#else
*ptr = static_cast<T*>(malloc(sizeof(T) * count));
#endif
assert(*ptr != nullptr);
}
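// Clock ticks per millisecond: the device clock rate (kHz) on the GPU, CLOCKS_PER_SEC / 1000 on the host.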
__host__
int64_t getClockSpeed()
{
int64_t clockSpeed;
#ifdef GPU
int tmp;
cudaDeviceGetAttribute(&tmp,cudaDevAttrClockRate,0);
clockSpeed = static_cast<int64_t>(tmp) * 1000;
#else
clockSpeed = static_cast<int64_t>(CLOCKS_PER_SEC);
#endif
return clockSpeed / 1000;
}
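// Read the GPU's global timer via inline PTX.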
static __device__ inline uint64_t gpuClock()
{
uint64_t globaltime;
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(globaltime) );
return globaltime;
}
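// Current timestamp: GPU global timer in the GPU build, clock() in the CPU build.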
__device__
clock_t getTicks()
{
clock_t ticks;
#ifdef GPU
ticks = static_cast<clock_t>(gpuClock());
#else
ticks = clock();
#endif
return ticks;
}
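// Sum all elements with a strided access pattern (stride = stepSize), repeated rounds times.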
__host__ __device__
int sparseRead(int32_t* mem, int elementCount, int stepSize, int rounds)
{
int sum = 0;
for(int r = 0; r < rounds; r +=1)
{
sum = 0;
for(int i = 0; i < stepSize; i += 1)
{
for(int j = i; j < elementCount; j += stepSize)
{
sum += mem[j];
}
}
}
return sum;
}
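// Sum all elements in order, repeated rounds times.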
__host__ __device__
int sequentialRead(int32_t* mem, int elementCount, int rounds)
{
int sum = 0;
for(int r = 0; r < rounds; r +=1)
{
sum = 0;
for(int j = 0; j < elementCount; j += 1)
{
sum += mem[j];
}
}
return sum;
}
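// Initialize the array, then time the single-threaded sum and report the elapsed ticks.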
__global__
void getElapsedTicks(int32_t* mem, int elementCount, int stepSize, int rounds, clock_t* elapsedTicks, int32_t* sum)
{
for(int i = 0; i < elementCount; i +=1)
{
mem[i] = i % rounds;
}
clock_t start = getTicks();
printf("Start %d\n", start);
*sum = sparseRead(mem, elementCount, stepSize, rounds);
//*sum = sequentialRead(mem, elementCount, rounds);
clock_t end = getTicks();
printf("End %d\n", end);
*elapsedTicks = end - start;
}
int main(int argc, char *argv[])
{
int elementCount = atoi(argv[1]);
int stepSize = atoi(argv[2]);
int rounds = atoi(argv[3]);
int32_t* mem;
memAlloc(&mem, elementCount);
clock_t* elapsedTicks;
memAlloc(&elapsedTicks, 1);
int32_t* sum;
memAlloc(&sum, 1);
#ifdef GPU
getElapsedTicks<<<1,1>>>(mem, elementCount, stepSize, rounds, elapsedTicks, sum);
cudaDeviceSynchronize();
#else
getElapsedTicks(mem, elementCount, stepSize, rounds, elapsedTicks, sum);
#endif
uint64_t elapsedTime = *elapsedTicks / getClockSpeed();
cout << "Sum "<< *sum << endl;
cout << "Elapsed " << elapsedTime << " ms" << endl;
}
I expected the GPU to be about 3x slower, since its clock is slower, but both the sequential and the sparse sums come out about 100x slower. What am I missing?