我在调试时遇到了这个奇怪的问题。
在我的代码中,我可以通过两种方式初始化主机数组srcArr_h[totArrElm]
:
1)
for(int ic=0; ic<totArrElm; ic++)
{
srcArr_h[ic] = (float)(rand() % 256);
}
或
2)(半数组元素将在运行时设置为零)
for(int ic=0; ic<totArrElm; ic++)
{
int randV = (rand() % 256);
srcArr_h[ic] = randV%2;
}
如果我使用这些数组作为内核函数的输入,我会得到截然不同的时序。特别是如果totArrElm = ARRDIM*ARRDIM
与ARRDIM = 8192
,我会得到
Timimg 1) 64599.3 ms
Timimg 2) 9764.1 ms
诀窍是什么?当然我确实验证了src主机初始化不会影响我得到的大时差。这对我来说听起来非常严格,但可能是因为在运行时进行了优化吗?
这是我的代码:
#include <string>
#include <stdint.h>
#include <iostream>
#include <stdio.h>
using namespace std;
#define ARRDIM 8192
__global__ void gpuKernel
(
float *sa, float *aux,
size_t memPitchAux, int w,
float *c_glob
)
{
float c_loc[256];
float sc_loc[256];
float g0=0.0f;
int tidx = blockIdx.x * blockDim.x + threadIdx.x; // x-coordinate of pixel = column in device memory
int tidy = blockIdx.y * blockDim.y + threadIdx.y; // y-coordinate of pixel = row in device memory
int idx = tidy * memPitchAux/4 + tidx;
for(int ic=0; ic<256; ic++)
{
c_loc[ic] = 0.0f;
}
for(int ic=0; ic<255; ic++)
{
sc_loc[ic] = 0.0f;
}
for(int is=0; is<255; is++)
{
int ic = fabs(sa[tidy*w +tidx]);
c_loc[ic] += 1.0f;
}
for(int ic=0; ic<255; ic++)
{
g0 += c_loc[ic];
}
aux[idx] = g0;
}
int main(int argc, char* argv[])
{
float time, loop_time;
cudaEvent_t start, stop;
cudaEvent_t start_loop, stop_loop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0) ;
/*
* array src host and device
*/
int heightSrc = ARRDIM;
int widthSrc = ARRDIM;
cudaSetDevice(0);
float *srcArr_h, *srcArr_d;
size_t nBytesSrcArr = sizeof(float)*heightSrc * widthSrc;
srcArr_h = (float *)malloc(nBytesSrcArr); // Allocate array on host
cudaMalloc((void **) &srcArr_d, nBytesSrcArr); // Allocate array on device
cudaMemset((void*)srcArr_d,0,nBytesSrcArr); // set to zero
int totArrElm = heightSrc*widthSrc;
cudaEventCreate(&start_loop);
cudaEventCreate(&stop_loop);
cudaEventRecord(start_loop, 0) ;
for(int ic=0; ic<totArrElm; ic++)
{
srcArr_h[ic] = (float)(rand() % 256); // case 1)
// int randV = (rand() % 256); // case 2)
// srcArr_h[ic] = randV%2;
}
cudaEventRecord(stop_loop, 0);
cudaEventSynchronize(stop_loop);
cudaEventElapsedTime(&loop_time, start_loop, stop_loop);
printf("Timimg LOOP: %3.1f ms\n", loop_time);
cudaMemcpy( srcArr_d, srcArr_h,nBytesSrcArr,cudaMemcpyHostToDevice);
/*
* auxiliary buffer auxD to save final results
*/
float *auxD;
size_t auxDPitch;
cudaMallocPitch((void**)&auxD,&auxDPitch,widthSrc*sizeof(float),heightSrc);
cudaMemset2D(auxD, auxDPitch, 0, widthSrc*sizeof(float), heightSrc);
/*
* auxiliary buffer auxH allocation + initialization on host
*/
size_t auxHPitch;
auxHPitch = widthSrc*sizeof(float);
float *auxH = (float *) malloc(heightSrc*auxHPitch);
/*
* kernel launch specs
*/
int thpb_x = 16;
int thpb_y = 16;
int blpg_x = (int) widthSrc/thpb_x + 1;
int blpg_y = (int) heightSrc/thpb_y +1;
int num_threads = blpg_x * thpb_x + blpg_y * thpb_y;
/* c_glob array */
int cglob_w = 256;
int cglob_h = num_threads;
float *c_glob_d;
size_t c_globDPitch;
cudaMallocPitch((void**)&c_glob_d,&c_globDPitch,cglob_w*sizeof(float),cglob_h);
cudaMemset2D(c_glob_d, c_globDPitch, 0, cglob_w*sizeof(float), cglob_h);
/*
* kernel launch
*/
dim3 dimBlock(thpb_x,thpb_y, 1);
dim3 dimGrid(blpg_x,blpg_y,1);
gpuKernel<<<dimGrid,dimBlock>>>(srcArr_d,auxD, auxDPitch, widthSrc, c_glob_d);
cudaThreadSynchronize();
cudaMemcpy2D(auxH,auxHPitch, // to CPU (host)
auxD,auxDPitch, // from GPU (device)
auxHPitch, heightSrc, // size of data (image)
cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Timimg: %3.1f ms\n", time);
cudaFree(srcArr_d);
cudaFree(auxD);
cudaFree(c_glob_d);
}
我的Makefile:
# OS Name (Linux or Darwin)
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
# Flags to detect 32-bit or 64-bit OS platform
OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/")
OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/")
# These flags will override any settings
ifeq ($(i386),1)
OS_SIZE = 32
OS_ARCH = i686
endif
ifeq ($(x86_64),1)
OS_SIZE = 64
OS_ARCH = x86_64
endif
# Flags to detect either a Linux system (linux) or Mac OSX (darwin)
DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
# Location of the CUDA Toolkit binaries and libraries
CUDA_PATH ?= /usr/local/cuda-5.0
CUDA_INC_PATH ?= $(CUDA_PATH)/include
CUDA_BIN_PATH ?= $(CUDA_PATH)/bin
ifneq ($(DARWIN),)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
ifeq ($(OS_SIZE),32)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64
endif
endif
# Common binaries
NVCC ?= $(CUDA_BIN_PATH)/nvcc
GCC ?= g++
# Extra user flags
EXTRA_NVCCFLAGS ?=
EXTRA_LDFLAGS ?=
EXTRA_CCFLAGS ?=
# CUDA code generation flags
# GENCODE_SM10 := -gencode arch=compute_10,code=sm_10
# GENCODE_SM20 := -gencode arch=compute_20,code=sm_20
# GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
GENCODE_SM10 := -gencode arch=compute_10,code=sm_10
GENCODE_SM20 := -gencode arch=compute_20,code=sm_20
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
#GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM10)
GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30)
# OS-specific build flags
ifneq ($(DARWIN),)
LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -arch $(OS_ARCH)
else
ifeq ($(OS_SIZE),32)
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m32
else
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
CCFLAGS := -m64
endif
endif
# OS-architecture specific flags
ifeq ($(OS_SIZE),32)
NVCCFLAGS := -m32
else
NVCCFLAGS := -m64
endif
# OpenGL specific libraries
ifneq ($(DARWIN),)
# Mac OSX specific libraries and paths to include
LIBPATH_OPENGL := -L../../common/lib/darwin -L/System/Library/Frameworks/OpenGL.framework/Libraries -framework GLUT -lGL -lGLU ../../common/lib/darwin/libGLEW.a
else
# Linux specific libraries and paths to include
LIBPATH_OPENGL := -L../../common/lib/linux/$(OS_ARCH) -L/usr/X11R6/lib -lGL -lGLU -lX11 -lXi -lXmu -lglut -lGLEW -lrt
endif
# Debug build flags
ifeq ($(dbg),1)
CCFLAGS += -g
NVCCFLAGS += -g -G
TARGET := debug
else
TARGET := release
endif
# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_INC_PATH) -I. -I.. -I../../common/inc
LDFLAGS += $(LIBPATH_OPENGL)
# Target rules
all: build
build: stackOverflow
stackOverflow.o: stackOverflow.cu
$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
stackOverflow: stackOverflow.o
$(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
mkdir -p ./bin/$(OSLOWER)/$(TARGET)
cp $@ ./bin/$(OSLOWER)/$(TARGET)
run: build
./stackOverflow
clean:
rm -f stackOverflow.o stackOverflow *.pgm
Cuda 5.0 on Tesla c1060,Ubuntu 12.04。
答案 0 :(得分:1)
Tesla C1060 GPU设备具有计算能力1.3,这意味着每个线程都有128个32位寄存器。它显然不足以适应所有局部变量(2个浮点数,每个256个元素,以及更多变量)。由于在以下行中访问本地内存
c_loc[ic] += 1.0f;
在案例(1)中,高度分布在整个范围0...255
,您可能会观察到寄存器溢出,这意味着您的数据被放入本地内存中。事实上,本地存储器位于全局存储器中,因此具有相同的吞吐量。访问可以缓存,但由于算法的随机性,我打赌缓存效率不高。 (编辑:对于计算能力1.3,它甚至没有缓存,它只是非合并的内存访问)。可以找到有关CUDA中的本地内存和寄存器溢出的良好介绍here。在那里,您还可以找到一些如何检测和解决寄存器溢出问题的指导。
考虑减少每个线程使用的本地数据量或使用位于芯片上的共享内存,因此更快。