I'm new to CUDA, so I may be doing something wrong. I just need to do a logical operation on two binary vectors. The vector length is 2048000. I compared the speed of a logical AND between a Matlab C MEX file and a CUDA kernel. The C code on the CPU is about 5% faster than CUDA. Note that I measured only the kernel execution (without memory transfers). I have an i7 930 and a 9800GT.
MEX file testCPU.c:
#include "mex.h"
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[] ) {
int i, varLen;
unsigned char *vars, *output;
vars = mxGetPr(prhs[0]);
plhs[0] = mxCreateLogicalMatrix(2048000, 1);
output = mxGetPr(plhs[0]);
for (i=0;i<2048000;i++){
output[i] = vars[i] & vars[2048000+i];
}
}
Compiled with:
mex testCPU.c
The vectors are created with:
vars = ~~(randi(2,2048000,2)-1);
And the speed is measured with:
tic;testCPU(vars);toc;
CUDA:

CUDA file testGPU.cu:
#include "mex.h"
#include "cuda.h"
__global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
int idx = blockIdx.x*blockDim.x+threadIdx.x;
out[idx] = in[idx] && in[idx+N];
}
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[] ) {
int i;
unsigned char *vars, *output, *gpu, *gpures;
vars = (unsigned char*)mxGetData(prhs[0]);
plhs[0] = mxCreateLogicalMatrix(2048000, 1);
output = (unsigned char*)mxGetData(plhs[0]);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float dt_ms;
// input GPU malloc
cudaEventRecord(start, 0);
cudaMalloc( (void **) &gpu, sizeof(unsigned char)*4096000);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("GPU input malloc: %f ms, %i\n", dt_ms, cudaGetLastError());
// output GPU malloc
cudaEventRecord(start, 0);
cudaMalloc( (void **) &gpures, sizeof(unsigned char)*2048000);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("GPU output malloc: %f ms, %i\n", dt_ms, cudaGetLastError());
// copy from CPU to GPU
cudaEventRecord(start, 0);
cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("copy input from CPU to GPU: %f ms, %i\n", dt_ms, cudaGetLastError());
dim3 dimBlock(32);
printf("thread count: %i\n", dimBlock.x);
dim3 dimGrid(2048000/dimBlock.x);
printf("block count: %i\n", dimGrid.x);
// --- KERNEL ---
cudaEventRecord(start, 0);
logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 2048000);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("GPU kernel: %f ms, %i\n", dt_ms, cudaGetLastError());
// result from GPU to CPU
cudaEventRecord(start, 0);
cudaMemcpy( output, gpures, sizeof(unsigned char)*2048000, cudaMemcpyDeviceToHost );
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&dt_ms, start, stop);
printf("copy output from GPU to CPU: %f ms, %i\n", dt_ms, cudaGetLastError());
cudaFree(gpu);
cudaFree(gpures);
}
Compiled with:
nvmex -f nvmexopts_9.bat testGPU.cu
-I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\include"
-L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\lib\x64" -lcudart -lcufft
Output:
GPU input malloc: 0.772160 ms, 0
GPU output malloc: 0.041728 ms, 0
copy input from CPU to GPU: 1.494784 ms, 0
thread count: 32
block count: 64000
*** GPU kernel: 3.761216 ms, 0 ***
copy output from GPU to CPU: 1.203488 ms, 0
Is that code OK? The CPU is ~0.1 ms faster than the CUDA kernel. I tried different thread counts (multiples of 32) up to 512; 32 was fastest. Using the operator & instead of && was almost 1 ms slower.

Is the 9800GT really that weak? What speedup can today's mainstream cards (i.e. GTX460, 560) offer?

Thank you
Kernel function:
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}
Main function:
uchar4 *gpu, *gpures;
// 32 was worst, 64,128,256,512 were similar
dim3 dimBlock(128);
// block count is now 4x smaller
dim3 dimGrid(512000/dimBlock.x);
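For completeness, a minimal sketch of how the rest of the call could look with these types (my assumption: the allocations and copies keep the same byte counts as in the original mexFunction, only the element type changes, and N becomes 512000, the number of uchar4 elements per vector):

// Sketch only: same byte counts as before, just viewed as uchar4 elements.
cudaMalloc( (void **) &gpu,    sizeof(uchar4)*1024000 );  // 2 x 512000 input elements
cudaMalloc( (void **) &gpures, sizeof(uchar4)*512000 );   // 512000 output elements
cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice );
logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 512000);  // N = 512000 uchar4 per vector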
Output:
GPU input malloc: 0.043360 ms, 0
GPU output malloc: 0.038592 ms, 0
copy input from CPU to GPU: 1.499584 ms, 0
thread count: 128
block count: 4000
*** GPU kernel: 0.131296 ms, 0 ***
copy output from GPU to CPU: 1.281120 ms, 0
Is that right? Almost a 30x speedup! It seems too good to be true, but the result is correct :) How much faster would a GTX560 be at this particular task? Thx
Is the code
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}
automatically transformed into:
__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    uchar4 buff;
    buff.x = in[idx].x;
    buff.y = in[idx].y;
    buff.z = in[idx].z;
    buff.w = in[idx].w;
    buff.x &= in[idx+N].x;
    buff.y &= in[idx+N].y;
    buff.z &= in[idx+N].z;
    buff.w &= in[idx+N].w;
    out[idx].x = buff.x;
    out[idx].y = buff.y;
    out[idx].z = buff.z;
    out[idx].w = buff.w;
}
by the compiler?
If it is, that explains my confusion about coalesced access. I thought that in[idx] & in[idx+N] leads to non-coalesced access, because non-consecutive memory is accessed. But in fact in[idx] and in[idx+N] are loaded in two coalesced steps. N can be any multiple of 16, because uchar4 is 4 bytes long and for coalesced access the address must be aligned to 64 bytes (on a 1.1 device). Am I right?
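A quick back-of-the-envelope check of that reasoning (my own sketch, not from the original post, using the compute capability 1.1 rule that a half-warp of 16 threads coalesces when it reads one aligned, contiguous 64-byte segment):

#include <stdio.h>

/* Sketch: for one half-warp (16 threads), each loading a 4-byte uchar4,
 * print the byte ranges touched by in[idx] and in[idx+N] and whether each
 * range starts on a 64-byte boundary. idx0 is the first idx of an arbitrary
 * half-warp; N is in uchar4 elements and assumed to be a multiple of 16. */
int main(void) {
    const int  halfwarp = 16;
    const long N    = 512000;          /* multiple of 16 */
    const long idx0 = 128L*5 + 16;     /* e.g. blockIdx.x = 5, blockDim.x = 128, second half-warp */

    long lo1 = 4*idx0,       hi1 = lo1 + 4*halfwarp;
    long lo2 = 4*(idx0 + N), hi2 = lo2 + 4*halfwarp;

    printf("in[idx]   -> bytes [%ld, %ld), 64-byte aligned: %s\n", lo1, hi1, lo1 % 64 ? "no" : "yes");
    printf("in[idx+N] -> bytes [%ld, %ld), 64-byte aligned: %s\n", lo2, hi2, lo2 % 64 ? "no" : "yes");
    return 0;
}

For this example half-warp (and, by the same arithmetic, for any other one) both ranges are 64 bytes long and start on a 64-byte boundary, so each half-warp issues exactly two coalesced loads, which matches the reasoning above.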
Answer 0 (score: 2)
As talonmies pointed out, you are accessing and processing your data byte-wise, which is far from optimal. A collection of techniques you may want to consider, such as instruction-level parallelism and buffered reads/writes, is summarized in Vasily Volkov's nVidia webinar Better Performance at Lower Occupancy.

In short, what you want to do is read several uint4 per thread in a coalesced way, process them, and only then store them.
Update
Would it make a difference if you re-wrote your code as follows?
__global__ void logical_and(unsigned int* in, unsigned int* out, int N) {
    int idx = blockIdx.x*blockDim.x*chunksize + threadIdx.x;
    unsigned int buff[chunksize];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] = in[ blockDim.x*k + idx ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] &= in[ blockDim.x*k + idx + N ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        out[ blockDim.x*k + idx ] = buff[k];
}
Note that I assume chunksize is a constant #define'd somewhere, e.g.

#define chunksize 4

and that you have to divide the number of blocks you launch and N by that number. I have also used unsigned int, which is just four packed uchar. In your calling function you may have to cast the pointers accordingly.
Answer 1 (score: 1)
I think what is happening here is called false sharing. I think the problem is that the byte-sized regions you are trying to write to from your threads produce a massive race condition, because different threads try to write to the same word-aligned address. I am not sure about the details on a GPU, but on a CPU, when different threads try to write to memory within the same 256-byte aligned region (called a cache line), they constantly block each other, dropping your global performance.