CUDA kernel slower than CPU

Date: 2012-05-27 19:24:20

Tags: cuda mex

I am new to CUDA and I am probably doing something wrong. I simply need to perform a logical operation on two binary vectors. The vectors are 2048000 elements long. I compared the speed of a logical AND implemented in a C mex file for MATLAB and in a CUDA kernel. The C code on the CPU is about 5% faster than CUDA. Please note that I measured only the kernel execution (without the memory transfers). I have an i7 930 and a 9800GT.

##MEX file testCPU.c:##

#include "mex.h"
void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {

    int i;
    unsigned char *vars, *output;

    /* the input is a logical matrix, so fetch it as raw data rather than via mxGetPr */
    vars = (unsigned char *) mxGetData(prhs[0]);
    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = (unsigned char *) mxGetData(plhs[0]);
    for (i=0;i<2048000;i++){
        output[i] = vars[i] & vars[2048000+i];
    }
}

Compile with:

mex testCPU.c

Create the vectors:

vars = ~~(randi(2,2048000,2)-1);

Measure the speed:

tic;testCPU(vars);toc;

CUDA

##CUDA file testGPU.cu##
#include "mex.h"
#include "cuda.h"

__global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx] = in[idx] && in[idx+N];
}


void mexFunction( int nlhs, mxArray *plhs[],
        int nrhs, const mxArray *prhs[] ) {

    int i;
    unsigned char *vars, *output, *gpu, *gpures;

    vars = (unsigned char*)mxGetData(prhs[0]);

    plhs[0] = mxCreateLogicalMatrix(2048000, 1);
    output = (unsigned char*)mxGetData(plhs[0]);       

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float dt_ms;

    // input GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpu, sizeof(unsigned char)*4096000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU input malloc: %f ms, %i\n", dt_ms, cudaGetLastError());

    // output GPU malloc
    cudaEventRecord(start, 0);
    cudaMalloc( (void **) &gpures, sizeof(unsigned char)*2048000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU output malloc: %f ms, %i\n", dt_ms, cudaGetLastError());

    // copy from CPU to GPU
    cudaEventRecord(start, 0);
    cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy input from CPU to GPU: %f ms, %i\n", dt_ms, cudaGetLastError());

    dim3 dimBlock(32);
    printf("thread count: %i\n", dimBlock.x);
    dim3 dimGrid(2048000/dimBlock.x);
    printf("block count: %i\n", dimGrid.x);

    // --- KERNEL ---
    cudaEventRecord(start, 0);
    logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 2048000);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("GPU kernel: %f ms, %i\n", dt_ms, cudaGetLastError());

    // result from GPU to CPU
    cudaEventRecord(start, 0);
    cudaMemcpy( output, gpures, sizeof(unsigned char)*2048000, cudaMemcpyDeviceToHost );
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&dt_ms, start, stop);
    printf("copy output from GPU to CPU: %f ms, %i\n", dt_ms, cudaGetLastError());


    cudaFree(gpu);
    cudaFree(gpures);

    // release the timing events so they do not leak on every mex call
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

}

Compile with:

 nvmex -f nvmexopts_9.bat testGPU.cu 
-I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\include" 
-L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\lib\x64" -lcudart -lcufft

Output:

GPU input malloc: 0.772160 ms, 0
GPU output malloc: 0.041728 ms, 0
copy input from CPU to GPU: 1.494784 ms, 0
thread count: 32
block count: 64000
*** GPU kernel: 3.761216 ms, 0 ***
copy output from GPU to CPU: 1.203488 ms, 0

Is that code OK? The CPU is ~0.1 ms faster than the CUDA kernel. I tried different thread counts (multiples of 32) up to 512; 32 was the fastest. Using the operator & instead of && was almost 1 ms slower.

Is the 9800GT really that weak? What speedup could a mainstream card of today (i.e. GTX460, 560) offer?

Thank you.

EDIT: Based on talonmies' comment, I made these modifications:

Kernel function:

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}

Main function:

uchar4 *gpu, *gpures;

// 32 was worst, 64,128,256,512 were similar
dim3 dimBlock(128);
// block count is now 4xtimes smaller
dim3 dimGrid(512000/dimBlock.x);
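
The matching allocation, copy, and launch lines are not shown in this edit; presumably they look something like the following sketch (my assumption based on the sizes in the question: the byte counts stay the same, only the element type and the N argument change):

    // 1024000 uchar4 = 4096000 bytes in, 512000 uchar4 = 2048000 bytes out (same byte counts as before)
    cudaMalloc( (void **) &gpu,    sizeof(uchar4)*1024000);
    cudaMalloc( (void **) &gpures, sizeof(uchar4)*512000);
    cudaMemcpy( gpu, vars, sizeof(uchar4)*1024000, cudaMemcpyHostToDevice);

    // N = 512000: each vector is 512000 uchar4 elements long
    logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 512000);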

Output:

GPU input malloc: 0.043360 ms, 0
GPU output malloc: 0.038592 ms, 0
copy input from CPU to GPU: 1.499584 ms, 0
thread count: 128
block count: 4000
*** GPU kernel: 0.131296 ms, 0 ***
copy output from GPU to CPU: 1.281120 ms, 0

Is this right? The speed-up is almost 30x! It seems too good to be true, but the result is correct :) How much faster would a GTX560 be at this particular task? Thanks.

编辑2:

Is the code

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;

    out[idx].x = in[idx].x & in[idx+N].x;
    out[idx].y = in[idx].y & in[idx+N].y;
    out[idx].z = in[idx].z & in[idx+N].z;
    out[idx].w = in[idx].w & in[idx+N].w;
}

automatically transformed into:

__global__ void logical_and(uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;  
    uchar4 buff;

    buff.x = in[idx].x;
    buff.y = in[idx].y;
    buff.z = in[idx].z;
    buff.w = in[idx].w;

    buff.x &= in[idx+N].x;
    buff.y &= in[idx+N].y;
    buff.z &= in[idx+N].z;
    buff.w &= in[idx+N].w;

    out[idx].x = buff.x;
    out[idx].y = buff.y;
    out[idx].z = buff.z;
    out[idx].w = buff.w;
}

by the compiler?

If it is, that explains my confusion about coalesced access. I thought that in[idx] & in[idx+N] leads to non-coalesced access, because non-contiguous memory is accessed. But in fact, in[idx] and in[idx+N] are loaded in two coalesced steps. N can be any multiple of 16, because a uchar4 is 4 bytes long and, for coalesced access, the address must be aligned to 64 bytes (on a 1.1 device). Am I right?
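
For comparison, here is a sketch (not from the original post, just an illustration) of the same kernel with the 4-byte loads written out explicitly, so each thread issues one coalesced load per vector regardless of how the compiler merges the per-field accesses:

__global__ void logical_and(const uchar4* in, uchar4* out, int N) {
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    uchar4 a = in[idx];      // one coalesced 4-byte load from the first vector
    uchar4 b = in[idx+N];    // one coalesced 4-byte load from the second vector
    out[idx] = make_uchar4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
}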

2 Answers:

Answer 0 (score: 2)

As talonmies pointed out, you are accessing and processing your data byte by byte, which is far from optimal. A family of techniques you may want to consider, such as instruction-level parallelism and buffered reads/writes, is summarized in Vasily Volkov's nVidia webinar Better Performance at Lower Occupancy.

In a nutshell, what you want to do is, in each thread, read several uint4 values in a coalesced way, process them, and only then store them.

Update:

Does it make any difference if you rewrite your code as follows?

__global__ void logical_and(unsigned int* in, unsigned int* out, int N) {
    int idx = blockIdx.x*blockDim.x*chunksize+threadIdx.x;
    unsigned int buff[chunksize];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] = in[ blockDim.x*k + idx ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        buff[k] &= in[ blockDim.x*k + idx + N ];
    #pragma unroll
    for ( int k = 0 ; k < chunksize ; k++ )
        out[ blockDim.x*k + idx ] = buff[k];
}

Note that I assume chunksize is a variable #define'd somewhere, e.g.

#define chunksize 4

and that you divide the number of blocks you launch, and N, by that number. I have also used unsigned int, which is just four packed uchars. In your calling function you will probably need to cast the pointers accordingly.
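
For reference, a minimal host-side sketch of how the launch could be adapted to this kernel (my own illustration, not part of the original answer; the numbers assume the 2048000-byte vectors from the question, i.e. 512000 unsigned ints per vector, and elemsPerVec is a hypothetical name):

#define chunksize 4

    unsigned int *gpu, *gpures;
    const int elemsPerVec = 2048000 / sizeof(unsigned int);   // 512000 uints per vector

    cudaMalloc( (void **) &gpu,    2 * elemsPerVec * sizeof(unsigned int));
    cudaMalloc( (void **) &gpures,     elemsPerVec * sizeof(unsigned int));
    cudaMemcpy( gpu, vars, 2 * elemsPerVec * sizeof(unsigned int), cudaMemcpyHostToDevice);

    dim3 dimBlock(128);
    // each thread now processes chunksize elements, so the grid shrinks by that factor
    dim3 dimGrid(elemsPerVec / chunksize / dimBlock.x);       // 1000 blocks
    // the last argument is the offset (in unsigned ints) of the second vector inside gpu
    logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, elemsPerVec);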

Answer 1 (score: 1)

I think what is happening here is called false sharing. I think the problem is that the byte-sized regions you are trying to write to from your threads produce a massive amount of contention, because different threads try to write to the same word-aligned address. I am not sure about the details on the GPU, but on CPUs, when different threads try to write to memory within the same 256-byte aligned region (called a cache line), they constantly block each other, dropping global performance.