我最近就自己写的一段用于生成素数的 CUDA 代码提过一个问题（this question）。令我惊讶的是，@talonmies 说这段代码在他那里运行正常。考虑到靠 Rick Roll 捉弄别人是攒不到 5k 声望的，我相信他的说法。但事实是，这段代码在我的机器上仍然无法正常工作。
如果有人能指出可能导致这种行为的原因、给出补救办法，或者只是在自己的机器上运行一下这段代码，让我能得到两个以上的结果样本，那就太好了。
谢谢!
P.S. 以下是有问题的代码：
#include <stdio.h>
#include <stdlib.h>
/*
 * One modular counter. new_counterClass() fills both fields, the count
 * kernel advances `count` and wraps it to 0 when it reaches `num`, and the
 * check kernel tests `count` against 0.
 */
typedef struct{
int num;   // wrap-around value: `count` is reset to 0 when it reaches this
int count; // current counter value; initialised as (count % by) by new_counterClass()
} counterClass;
/*
 * Host-side helper that returns a counter configured for the value `by`.
 *
 * aCounter : counter passed by value and used as the starting copy; both of
 *            its fields are overwritten before it is returned.
 * by       : stored in `num`; also the divisor of the modulo below, so it
 *            must be non-zero.
 * count    : starting value, reduced to `count % by` before being stored.
 *
 * Returns the updated copy; the caller's original object is not modified.
 */
counterClass new_counterClass(counterClass aCounter, int by, int count){
    counterClass configured = aCounter;
    configured.count = count % by;
    configured.num = by;
    return configured;
}
/*
 * Kernel: advance every counter by one step, wrapping to 0 when the
 * counter reaches its `num`.
 *
 * Launch layout: 1D; each thread owns the single element at its flat global
 * index. There is no bounds check, so the launch must create exactly as many
 * threads as there are valid entries in `Counters`.
 *
 * Counters : device array of counters, updated in place.
 */
__global__ void count(counterClass *Counters){
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;
    counterClass *mine = &Counters[slot];

    // Compute the stepped value once and store the final result.
    const int stepped = mine->count + 1;
    mine->count = (stepped == mine->num) ? 0 : stepped;

    // Block-wide barrier kept from the original; no work follows it.
    __syncthreads();
}
/*
 * Kernel: clear `*result` if any counter currently sits at 0.
 *
 * Launch layout: 1D; each thread inspects the single element at its flat
 * global index. There is no bounds check, so the launch must create exactly
 * as many threads as there are valid entries in `Counters`.
 *
 * Counters : device array of counters (read only here).
 * result   : device flag. This kernel only ever writes `false` to it, so the
 *            caller has to set it to `true` before the launch. Several
 *            threads may store concurrently, but they all store the same
 *            value.
 */
__global__ void check(counterClass *Counters, bool *result){
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;
    const bool wrapped = (Counters[slot].count == 0);

    if (wrapped){
        *result = false;
    }

    // Block-wide barrier kept from the original; no work follows it.
    __syncthreads();
}
/* Abort with a readable message when a CUDA runtime call reports an error. */
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Prints primes greater than 2, one per line, until `tPrimes` primes
 * (counting the seed prime 2) are known.
 *
 * One counter per known prime lives on the device. For each candidate n the
 * `count` kernel steps every counter and the `check` kernel clears the
 * result flag if any counter wrapped to 0. If the flag survives, n is
 * printed and a new counter for n is appended.
 *
 * Every CUDA call and kernel launch is checked. Without that, kernels that
 * never run leave the host-written `true` in the result flag, so every
 * candidate is printed as if it were prime.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main(){
    int tPrimes = 5; // Total Primes to Find
    int nPrimes = 1; // Number of Primes Found
    int n = 2;       // last candidate examined; 2 is the seed prime
    bool *d_result = NULL, h_result = true;
    counterClass *d_counters = NULL;

    // calloc instead of malloc: the unused slots are uploaded to the device
    // and slot contents are passed by value to new_counterClass(), so they
    // must not hold indeterminate values.
    counterClass *h_counters = (counterClass *)calloc(tPrimes, sizeof(counterClass));
    if(h_counters == NULL){
        fprintf(stderr, "out of host memory\n");
        return EXIT_FAILURE;
    }
    h_counters[0] = new_counterClass(h_counters[0], 2, 0);

    checkCuda(cudaMalloc((void **)&d_counters, tPrimes*sizeof(counterClass)), "cudaMalloc(d_counters)");
    checkCuda(cudaMalloc((void **)&d_result, sizeof(bool)), "cudaMalloc(d_result)");
    checkCuda(cudaMemcpy(d_counters, h_counters, tPrimes*sizeof(counterClass), cudaMemcpyHostToDevice),
              "initial upload of counters");

    while(nPrimes<tPrimes){
        // The check kernel only ever writes false, so re-arm the flag first.
        h_result = true;
        checkCuda(cudaMemcpy(d_result, &h_result, sizeof(bool), cudaMemcpyHostToDevice), "upload of result flag");
        n += 1;

        // One block with one thread per known prime: the kernels have no
        // bounds check, so the thread count must equal nPrimes exactly.
        count<<<1,nPrimes>>>(d_counters);
        checkCuda(cudaGetLastError(), "launch of count kernel");
        check<<<1,nPrimes>>>(d_counters, d_result);
        checkCuda(cudaGetLastError(), "launch of check kernel");

        // Blocking copy: waits for both kernels and reports any error they
        // raised while executing.
        checkCuda(cudaMemcpy(&h_result, d_result, sizeof(bool), cudaMemcpyDeviceToHost), "download of result flag");

        if(h_result){
            printf("%d\n", n);
            // The existing device counters are already up to date; only the
            // new slot has to be written, so upload that single element
            // instead of round-tripping the whole array.
            h_counters[nPrimes] = new_counterClass(h_counters[nPrimes], n, 0);
            checkCuda(cudaMemcpy(d_counters + nPrimes, h_counters + nPrimes, sizeof(counterClass),
                                 cudaMemcpyHostToDevice),
                      "upload of new counter");
            nPrimes += 1;
        }
    }

    checkCuda(cudaFree(d_result), "cudaFree(d_result)");
    checkCuda(cudaFree(d_counters), "cudaFree(d_counters)");
    free(h_counters);
    return 0;
}
我的运行结果：
$ nvcc parraPrimes.cu -o primes
$ ./primes
3
4
5
6
而在 @talonmies 那里，同样的代码打印的是 3, 5, 7, 11 ……这到底是怎么回事（wtf），对吧？