我正在用c ++创建一个并行版的Eratosthenes Sieve。问题是我的内核调用(reduce0)似乎只为每个块分配8个线程而不是我指定的256个线程。因为即使第一个CUDA版本允许每个块512个线程,我的代码中也必须有一些错误。任何帮助将不胜感激。
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
int main(){
int n = pow((double) 2, 8);
int total = sieve(n);
cout << "# primes" << endl << total << endl;
return 0;
}
///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
// Allocate and copy device arrays
int *g_idevice;
int *g_odevice;
int size = n * sizeof(int);
cudaMalloc(&g_idevice, size);
cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
cudaMalloc(&g_odevice, size);
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// Copy device data back to primes
cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
//for (int i = 0; i < n; i++) {
// cout << i << " " << primes[i] << endl;
//}
int total = primes[0];
cudaFree(g_idevice);
cudaFree(g_odevice);
return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
int i = f;
while(arg[i]!= 1 && i < n) {
i++;
}
return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
int total = 0;
int i = 2;
while(i < n){
if(arg[i] == 1){
total = total + 1;
}
i++;
}
return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
int* primes = NULL;
int mult = 0;
int k = 2;
int i; int total;
//primes = new int[n];
primes = new int[256];
for(i = 0; i < n; i++){
primes[i] = 1;
}
primes[0] = primes[1] = 0;
while (k * k < n){
mult = k * k;
while (mult < n) {
primes[mult] = 0;
mult = mult + k;
}
k = findsmallest(primes,k+1, n);
}
total = call_kernel(primes, n);
//delete [] primes;
//primes = NULL;
return total;
}
答案 0 :(得分:2)
您的内核使用动态分配的共享内存,但内核启动不包含任何分配,因此结果是内核将因为共享内存缓冲区上的非法内存操作而中止。如果您按如下方式修改call_kernel的这一部分,您应该会发现它有效:
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
如果你在函数调用中包含了一些基本的错误检查,可能是这样的:
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}
// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
很明显,内核启动或执行失败并出现错误。