The main idea of the kernel is to measure the latency of atomicAdd to global memory, so the first step is to obtain the baseline latency of atomicAdd for one thread and one block. The measured times differ greatly between different positions in global memory. Why? The kernel is as follows, in kernel.h:
#ifndef _KERNEL_H_
#define _KERNEL_H_

template <class T, class ITYPE>
__global__ void collision(T * y, T * oldVal, ITYPE * interval, ITYPE * time)
{
    ITYPE warp, vector_lane, thread_lane, thread_id, partial;
    warp = 32;
    vector_lane = (blockDim.x + warp - 1) / warp;   // warps per block
    thread_lane = threadIdx.x & (warp - 1);         // lane within the warp
    thread_id = threadIdx.x / warp;                 // warp index within the block
    ITYPE threads = threadIdx.x;
    ITYPE start_time, end_time;
    ITYPE position = 0;
    T value = 1.0;
    T old = 0.0f;
    partial = threadIdx.x & (warp - 1);
    start_time = clock();
    // set different value for variable position
    old = atomicAdd(&y[position], value);
    end_time = clock();
    if (thread_lane == 0)
        time[blockIdx.x * vector_lane + thread_id] = end_time - start_time;
    oldVal[2] = old;
}

template <class T, class ITYPE>
void __collision__(T * y, T * oldVal, ITYPE * interval, ITYPE * time, ITYPE & number_SM)
{
    const unsigned int THREADS_PER_BLOCK = 1;
    const unsigned int NUM_BLOCKS = 1;
    // get the number of multiprocessors
    ITYPE dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    number_SM = deviceProp.multiProcessorCount;
    printf("multiProcessors=%d\n", number_SM);
    if (NUM_BLOCKS < 13)
        number_SM = NUM_BLOCKS;
    printf("THREADS_PER_BLOCK=%d\n", THREADS_PER_BLOCK);
    printf("NUM_BLOCKS=%d\n", NUM_BLOCKS);
    collision<T,ITYPE><<<NUM_BLOCKS,THREADS_PER_BLOCK>>>(y, oldVal, interval, time);
}
#endif
The code of collision.cu is as follows:
#include "run.h"
using namespace std;
typedef float VALUETYPE;
typedef int INDEXTYPE;
int main(int argc, char *args[])
{
    launtch<VALUETYPE,INDEXTYPE>();
}
The code of run.h is as follows:
#ifndef _RUN_H_
#define _RUN_H_
#include <stdio.h>
#include <iostream>
#include <string>
#include "kernel.h"
#include <shrQATest.h>
#include <shrUtils.h>
#include <helper_cuda.h>
using namespace std;
template <class T, class ITYPE>
void launtch()
{
    const ITYPE LENGTH = 64 * 208;
    ITYPE number_SM = 1;
    T * y = new T[LENGTH];
    T * oldVal = new T[LENGTH];
    ITYPE * interval = new ITYPE[LENGTH];
    ITYPE * time = new ITYPE[LENGTH];
    memset(y, 0.0f, sizeof(T) * LENGTH);
    memset(oldVal, 0.0f, sizeof(T) * LENGTH);
    memset(time, 0, sizeof(ITYPE) * LENGTH);
    T * dy;
    T * dOldVal;
    ITYPE * dinterval;
    ITYPE * dtime;
    checkCudaErrors(cudaMalloc(&dy, LENGTH * sizeof(T)));
    checkCudaErrors(cudaMalloc(&dOldVal, LENGTH * sizeof(T)));
    checkCudaErrors(cudaMalloc(&dinterval, LENGTH * sizeof(ITYPE)));
    checkCudaErrors(cudaMalloc(&dtime, LENGTH * sizeof(ITYPE)));
    checkCudaErrors(cudaMemcpy(dy, y, sizeof(T) * LENGTH, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dOldVal, oldVal, sizeof(T) * LENGTH, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dinterval, interval, sizeof(ITYPE) * LENGTH, cudaMemcpyHostToDevice));
    __collision__<T,ITYPE>(dy, dOldVal, dinterval, dtime, number_SM);
    checkCudaErrors(cudaMemcpy(time, dtime, LENGTH * sizeof(ITYPE), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(y, dy, LENGTH * sizeof(T), cudaMemcpyDeviceToHost));
    ITYPE sum = 0, count = 0;
    for (ITYPE i = 0; i < LENGTH; i++)
    {
        if (time[i] > 0)
        {
            sum += time[i];
            count++;
            cout << " [" << i << "]=" << time[i];
            if (count % 10 == 0)
                cout << endl;
        }
    }
    cout << endl << "number_SM=" << number_SM << endl;
    cout << "average=" << sum / number_SM << endl;
    cout << "y[2]=" << y[2] << endl;
}
#endif
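One thing worth noting (my addition, not part of the original post): the kernel launch inside __collision__ is never checked for errors. A minimal, hedged addition after the launch, using the helper_cuda.h macro the code already includes, would be:

    checkCudaErrors(cudaGetLastError());        // reports launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());   // reports errors raised during kernel execution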
The makefile is as follows:
NVIDIA = /root/NVIDIA_CUDA-5.0_Samples
CUDA = /usr/local/cuda-5.0
#NVIDINCADD = -I$(NVIDIA)/shared/inc -I$(NVIDIA)/C/common/inc
NVIDINCADD = -I$(NVIDIA)/common/inc
CUDAINCADD = -I$(CUDA)/include -I$(CUDA)/shared/inc
CC = -L/usr/lib64 -lstdc++
GCCOPT = -O2 -fno-rtti -fno-exceptions
INTELOPT = -O3 -fno-rtti -xW -restrict -fno-alias
#DEB = -g
#NVCC = -G
#ARCH = -arch=sm_13
ARCH = -arch=sm_35
collision: collision.cu
	nvcc $(DEB) $(NVCC) $(ARCH) -lm $(NVIDINCADD) $(CUDAINCADD) -o $(@) $(<)

clean:
	rm -f collision
	rm -f a.out
If the value of position is 0, time[0] is 46; if position is 2, time[0] is 369. The platform is a K20m with CUDA 5.0.
Answer (score: 2):
Wow, that's a lot of code, most of it irrelevant to what you are trying to show. Next time, try to trim out the unnecessary parts.
Also, you are passing a float value as the second parameter to memset. memset sets byte quantities and expects an unsigned char in its second parameter.
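As a minimal illustration (my sketch, not part of the original answer): memset(y, 0.0f, n) only appears to work because 0.0f converts to the int 0, whose byte pattern is all zeros; a non-zero float would not survive the round trip:

#include <cstdio>
#include <cstring>
#include <algorithm>

int main()
{
    float a[4];
    memset(a, 0, sizeof(a));      // OK: the all-zero byte pattern is 0.0f
    memset(a, 1, sizeof(a));      // wrong: each float becomes 0x01010101,
                                  // roughly 2.4e-38, not 1.0f
    printf("%g\n", a[0]);
    std::fill(a, a + 4, 1.0f);    // a typed fill stores the intended value
    printf("%g\n", a[0]);
    return 0;
}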
Using your code, I was able to reproduce some variation between position values of 0 and 2. For the 0 case I got a time of 76, and for the 2 case I got a time of 118, so not as large a variation as yours.
However, because you are making the change and then recompiling the code, the compiler can emit a different instruction stream for each case, which makes the results look different.
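One way to verify this yourself (my suggestion; the file names are placeholders) is to disassemble each build with cuobjdump and compare the SASS, which is how disassembly like that shown further below can be obtained:

nvcc -arch=sm_35 -cubin -o collision.cubin collision.cu
cuobjdump -sass collision.cubin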
Instead, I suggest you try this code:
#include <iostream>

#define DWIDTH 32

typedef float mytype;

template <typename T>
__global__ void collision(int *time, T *data, T *old)
{
    // time an atomicAdd to each of DWIDTH different global-memory locations
    for (int i = 0; i < DWIDTH; i++) {
        unsigned long start_time = clock64();
        T my_old = atomicAdd(data + i, (T) 1);
        unsigned long end_time = clock64();
        time[i] = end_time - start_time;
        old[i] = my_old;
    }
}

int main()
{
    mytype *h_data, *d_data;
    int *h_time, *d_time;
    mytype *h_old, *d_old;
    cudaMalloc((void **)&d_time, DWIDTH * sizeof(int));
    h_time = (int *)malloc(DWIDTH * sizeof(int));
    cudaMalloc((void **)&d_data, DWIDTH * sizeof(mytype));
    h_data = (mytype *)malloc(DWIDTH * sizeof(mytype));
    cudaMalloc((void **)&d_old, DWIDTH * sizeof(mytype));
    h_old = (mytype *)malloc(DWIDTH * sizeof(mytype));
    for (int i = 0; i < DWIDTH; i++) {
        h_time[i] = 0;
        h_data[i] = (mytype) 0;
    }
    cudaMemcpy(d_data, h_data, DWIDTH * sizeof(mytype), cudaMemcpyHostToDevice);
    cudaMemcpy(d_time, h_time, DWIDTH * sizeof(int), cudaMemcpyHostToDevice);
    collision<<<1,1>>>(d_time, d_data, d_old);
    cudaMemcpy(h_time, d_time, DWIDTH * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_data, d_data, DWIDTH * sizeof(mytype), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_old, d_old, DWIDTH * sizeof(mytype), cudaMemcpyDeviceToHost);
    std::cout << "times:" << std::endl;
    for (int i = 0; i < DWIDTH; i++)
        std::cout << h_time[i] << " ";
    std::cout << std::endl << "data:" << std::endl;
    for (int i = 0; i < DWIDTH; i++)
        std::cout << h_data[i] << " ";
    std::cout << std::endl << "old:" << std::endl;
    for (int i = 0; i < DWIDTH; i++)
        std::cout << h_old[i] << " ";
    std::cout << std::endl;
    return 0;
}
When I compile for sm_35 and run on a K20m, I get:
$ nvcc -arch=sm_35 -o t284 t284.cu
$ ./t284
times:
98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98
data:
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
old:
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
$
The nice thing about this code is that the compiler has no opportunity to emit a different instruction stream depending on whether I set position to 0 or 2, and so I get consistent results.
With your code, when I compile with a non-zero value of position (and sm_20), I get something like this:
/*0038*/ /*0x40011c042c000001*/ S2R R4, SR_ClockLo;
/*0040*/ /*0x04411e036000c000*/ SHL.W R4, R4, 0x1;
/*0048*/ /*0x80015de428004000*/ MOV R5, c [0x0] [0x20];
/*0050*/ /*0x10519c034801c000*/ IADD R6.CC, R5, 0x4;
/*0058*/ /*0x00015de218fe0000*/ MOV32I R5, 0x3f800000;
/*0060*/ /*0x93f1dc4348004000*/ IADD.X R7, RZ, c [0x0] [0x24];
/*0068*/ /*0x00615e056c7e2800*/ ATOM.E.ADD.F32.FTZ.RN R5, [R6], R5;
/*0070*/ /*0x40019c042c000001*/ S2R R6, SR_ClockLo;
When I compile with a zero value of position (and sm_20), I get something like this:
/*0048*/ /*0x40019c042c000001*/ S2R R6, SR_ClockLo;
/*0050*/ /*0x04619e036000c000*/ SHL.W R6, R6, 0x1;
/*0058*/ /*0x0001dde218fe0000*/ MOV32I R7, 0x3f800000;
/*0060*/ /*0x0021de056c7e1000*/ ATOM.E.ADD.F32.FTZ.RN R2, [R2], R7;
/*0068*/ /*0x4000dc042c000001*/ S2R R3, SR_ClockLo;
So we can see that with your code, the value of position can have an effect on the generated code, and therefore on the timing.
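As a closing note (my own sketch, not from the answer above): if position is passed as a kernel argument rather than hard-coded, the compiler must emit a single instruction stream that works for every position, so one binary can time any location and the measurements stay comparable:

template <class T, class ITYPE>
__global__ void collision_param(T * y, ITYPE * time, ITYPE position)
{
    // position is a runtime value here, so changing it cannot change the SASS
    ITYPE start_time = clock();
    atomicAdd(&y[position], (T) 1);
    ITYPE end_time = clock();
    if (threadIdx.x == 0)
        time[blockIdx.x] = end_time - start_time;
}

// e.g. collision_param<float,int><<<1,1>>>(dy, dtime, 2);

Any remaining difference in the reported times then comes from the memory location itself, not from a different instruction sequence.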