Question

我正在尝试在CUDA中为任意大小的1-D数组实现前缀和。我想弄清楚的第一步是将数组拆分成多个块。

在我的第一个实验中，我有一个大小为8的数组。我使用https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html中的示例39-2中提供的并行算法为每个指定了2个块和4个线程

我的想法是

我将数组拆分为两个块，每四个线程将处理每个块中的四个元素（“块”）。
然后我希望分别从两个块中获得两个前缀和结果。
然后我可以将第一个“chunk”中前缀sum输出的最后一个元素添加到第二个“chunk”中的所有元素。并得到最终答案。

现在我陷入了第2步。我无法分别从两个块中得到正确的结果。有谁能请给我一些更正？谢谢！

以下是我的代码：

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define N 8
#define thread_num 4
#define block_num 2


__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}

int main()
{
float a[N], c[N], g[N];
timeval start, end;

float *dev_a, *dev_g;
int size = N * sizeof(float);

double d_gpuTime, d_cpuTime;

// initialize matrices a
for (int i = 0; i < N; i++)
{
//        a[i] = (float)(rand() % 1000000) / 1000.0;
    a[i] = i+1;
    printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);

gettimeofday(&start, NULL);

cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);

prescan<<<block_num,thread_num,2*N*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();

cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);

gettimeofday(&start, NULL);
scanCPU(c, a, N);

gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);

cudaFree(dev_a); cudaFree(dev_g);

for (int i = 0; i < N; i++)
{
    printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
}

printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}


__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern  __shared__  float temp[];
// allocated on invocation
int thid = threadIdx.x;
int bid = blockIdx.x;
int fake_n = block_num * thread_num;


int offset = 1;
if(bid * thread_num + thid<n){ temp[bid * thread_num + thid]  = g_idata[bid * thread_num + thid];
}else{ temp[bid * thread_num + thid]  = 0;
} // Make the "empty" spots zeros, so it won't affect the final result.

for (int d = thread_num>>1; d > 0; d >>= 1)
    // build sum in place up the tree
{
    __syncthreads();
    if (thid < d)
    {
        int ai = bid * thread_num + offset*(2*thid+1)-1;
        int bi = bid * thread_num + offset*(2*thid+2)-1;
        temp[bi] += temp[ai];
    }
    offset *= 2;
}

if (thid == 0)
{
    temp[thread_num - 1] = 0;
}

// clear the last element
for (int d = 1; d < thread_num; d *= 2)
    // traverse down tree & build scan
{
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
        int ai = bid * thread_num + offset*(2*thid+1)-1;
        int bi = bid * thread_num + offset*(2*thid+2)-1;
        float t = temp[bid * thread_num + ai];
        temp[ai]  = temp[ bi];
        temp[bi] += t;
    }
}
__syncthreads();

g_odata[bid * thread_num + thid] = temp[bid * thread_num + thid];
}

void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
    f_out[i] = f_out[i-1] + f_in[i-1];

}

Answer 1

您所犯的基本错误是认为您要创建一个共享内存数组，该数组是要扫描的数组的完整大小。

相反，当我们将工作分解为线程块时，我们只创建一个足够大的每个线程块大小的共享内存数组。你很可能对共享内存有一些误解。从逻辑上讲，它是每线程块资源。每个线程块都有自己独立的逻辑共享内存阵列。此外，共享内存仅限于每个SM可支持的大小，或逻辑线程块限制（通常为48KB）。因此，尝试分配与数据阵列大小相同的共享内存阵列无论如何都不会起作用，因为您的实际问题大小会变得很大＆＃34;：

prescan<<<block_num,thread_num,2*N*sizeof(float)>>>(dev_g, dev_a, N);
                                 ^

相反，通常的方法是为每个线程块分配足够的空间，在这种情况下，这是由每个块的线程数决定的：

prescan<<<block_num,thread_num,2*thread_num*sizeof(float)>>>(dev_g, dev_a, N);

对应于此更改，我们必须更改内核代码中的索引，以便全局内存访问使用完整数组索引，但共享内存访问使用＆＃34; local＆＃34; index，由共享内存数组的大小决定。

这实际上很简单;我们需要从内核代码中删除正在使用或创建共享内存（bid*thread_num）索引的每个位置的所有temp构造。这样，一旦我们将数据存储在共享内存中，每个线程块的行为相同，就其自己的共享内存副本而言。

这是一个有效的例子：

$ cat t80.cu
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define N 8
#define thread_num 4
#define block_num 2


__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}

int main()
{
float a[N], c[N], g[N];
timeval start, end;

float *dev_a, *dev_g;
int size = N * sizeof(float);

double d_gpuTime, d_cpuTime;

// initialize matrices a
for (int i = 0; i < N; i++)
{
//        a[i] = (float)(rand() % 1000000) / 1000.0;
    a[i] = i+1;
    printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);

gettimeofday(&start, NULL);

cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);

prescan<<<block_num,thread_num,2*thread_num*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();

cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);

gettimeofday(&start, NULL);
scanCPU(c, a, N);

gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);

cudaFree(dev_a); cudaFree(dev_g);

for (int i = 0; i < N; i++)
{
    printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
}

printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}


__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern  __shared__  float temp[];
// allocated on invocation
int thid = threadIdx.x;
int bid = blockIdx.x;


int offset = 1;
if((bid * thread_num + thid)<n){ temp[thid]  = g_idata[bid * thread_num + thid];
}else{ temp[thid]  = 0;
} // Make the "empty" spots zeros, so it won't affect the final result.

for (int d = thread_num>>1; d > 0; d >>= 1)
    // build sum in place up the tree
{
    __syncthreads();
    if (thid < d)
    {
        int ai = offset*(2*thid+1)-1;
        int bi = offset*(2*thid+2)-1;
        temp[bi] += temp[ai];
    }
    offset *= 2;
}

if (thid == 0)
{
    temp[thread_num - 1] = 0;
}

// clear the last element
for (int d = 1; d < thread_num; d *= 2)
    // traverse down tree & build scan
{
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
        int ai = offset*(2*thid+1)-1;
        int bi = offset*(2*thid+2)-1;
        float t = temp[ai];
        temp[ai]  = temp[ bi];
        temp[bi] += t;
    }
}
__syncthreads();

g_odata[bid * thread_num + thid] = temp[thid];
}

void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
    f_out[i] = f_out[i-1] + f_in[i-1];

}
$ nvcc -arch=sm_60 -o t80 t80.cu
$ cuda-memcheck ./t80
========= CUDA-MEMCHECK
a[0] = 1.000000
a[1] = 2.000000
a[2] = 3.000000
a[3] = 4.000000
a[4] = 5.000000
a[5] = 6.000000
a[6] = 7.000000
a[7] = 8.000000
c[0] = 0.000, g[0] = 0.000
c[1] = 1.000, g[1] = 1.000
c[2] = 3.000, g[2] = 3.000
c[3] = 6.000, g[3] = 6.000
c[4] = 10.000, g[4] = 0.000
c[5] = 15.000, g[5] = 5.000
c[6] = 21.000, g[6] = 11.000
c[7] = 28.000, g[7] = 18.000
GPU Time for scan size 8: 0.004464
CPU Time for scan size 8: 0.000000
========= ERROR SUMMARY: 0 errors
$

请注意，无论我们访问全局内存数组的哪个位置，我们都使用一个索引来从全局内存数组中选择一个块大小的块。但无论我们在哪里索引到共享内存，我们都会使用为单个块大小的块创建的索引。

使用CUDA的任意大小的前缀和

1 个答案: