Question

我正在用c ++创建一个并行版的Eratosthenes Sieve。问题是我的内核调用（reduce0）似乎只为每个块分配8个线程而不是我指定的256个线程。因为即使第一个CUDA版本允许每个块512个线程，我的代码中也必须有一些错误。任何帮助将不胜感激。

#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;

////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);

////////////////////////////////////////////////////
int main(){
    int n = pow((double) 2, 8);
    int total = sieve(n);
    cout << "# primes" << endl << total << endl;
    return 0;
}
///////////////////////////////////////////////////

__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];

// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();

// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
    if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
        sdata[tid] += sdata[tid + s];
    }
    __syncthreads();
}

// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/////////////////////////////////////////////////////

int call_kernel(int *primes, int n){
    // Allocate and copy device arrays
    int *g_idevice;
    int *g_odevice;
    int size = n * sizeof(int);
    cudaMalloc(&g_idevice, size);
    cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
    cudaMalloc(&g_odevice, size);

    // Specify grid/block dimenstions and invoke the kernel
    dim3 dimGrid(1,1);
    dim3 dimBlock(256,1);
    reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);

    // Copy device data back to primes
    cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);

    //for (int i = 0; i < n; i++) {
    //  cout << i << "  " << primes[i] << endl;
    //}
    int total = primes[0];
    cudaFree(g_idevice);
    cudaFree(g_odevice);
    return total;


}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
    int i = f;
    while(arg[i]!= 1 && i < n) {
        i++;
    }
    return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
    int total = 0;
    int i = 2;
    while(i < n){
        if(arg[i] == 1){
        total = total + 1;
        }
        i++;
    }
    return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
    int* primes = NULL;
    int mult = 0;
    int k = 2;
    int i; int total;
    //primes = new int[n];
    primes = new int[256];
    for(i = 0; i < n; i++){
        primes[i] = 1;
    }
    primes[0] = primes[1] = 0;

    while (k * k < n){
        mult = k * k;
        while (mult < n) {
            primes[mult] = 0;
            mult =  mult + k;
        }
        k = findsmallest(primes,k+1, n);
    }
    total = call_kernel(primes, n);
    //delete [] primes;
    //primes = NULL;
    return total;
}

Answer 1

您的内核使用动态分配的共享内存，但内核启动不包含任何分配，因此结果是内核将因为共享内存缓冲区上的非法内存操作而中止。如果您按如下方式修改call_kernel的这一部分，您应该会发现它有效：

// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);

如果你在函数调用中包含了一些基本的错误检查，可能是这样的：

reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
    cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}

// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
    cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}

很明显，内核启动或执行失败并出现错误。

无论规范如何，Cuda调用都不会为每个块分配超过8个线程

1 个答案: