Question

我正在尝试制作一个按照cuda设计计算的分数计算器，下面首先是顺序版本，然后我尝试并行版本。它运行没有错误，但由于某种原因，它没有给出结果，我一直试图让它工作2周，但找不到错误！

服务版

int f(int x, int c, int n);
int gcd(unsigned int u, unsigned int v);

int main ()
{
    clock_t start = clock();

    srand ( time(NULL) );

    int x = 1;
    int y = 2;
    int d = 1;


    int c = rand() % 100;
    int n = 323;

    if(n % y == 0)
        d = y;

    while(d == 1)
    {
        x = f(x, c, n);
        y = f(f(y, c, n), c, n);
        int abs = x - y;
        if(abs < 0)
            abs = abs * -1;
        d = gcd(abs, n);

        if(d == n)
        {
            printf("\nd == n");
            c = 0;
            while(c == 0 || c == -2)
                c = rand() % 100;   
            x = 2;
            y = 2;
        }
    }

    int d2 = n/d;

    printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);
    printf("\nResult: %d", d);
    printf("\nResult2: %d", d2);


    int dummyReadForPause;
    scanf_s("%d",&dummyReadForPause);
}

int f(int x, int c, int n)
{
    return (int)(pow((float)x, 2) + c) % n;
}

int gcd(unsigned int u, unsigned int v){

    int shift;

     / * GCD(0,x) := x * /
     if (u == 0 || v == 0)
       return u | v;

     / * Let shift := lg K, where K is the greatest power of 2
        dividing both u and v. * /
     for (shift = 0; ((u | v) & 1) == 0; ++shift) {
         u >>= 1;
         v >>= 1;
     }

     while ((u & 1) == 0)
       u >>= 1;

     / * From here on, u is always odd. * /
     do {
         while ((v & 1) == 0)  / * Loop X * /
           v >>= 1;

         / * Now u and v are both odd, so diff(u, v) is even.
            Let u = min(u, v), v = diff(u, v)/2. * /
         if (u < v) {
             v -= u;
         } else {
             int diff = u - v;
             u = v;
             v = diff;
         }
         v >>= 1;
     } while (v != 0);

     return u << shift;
}

并行版

#define threads 512
#define MaxBlocks 65535
#define RunningTheads (512*100)

__device__ int gcd(unsigned int u, unsigned int v)
{
    int shift;
    if (u == 0 || v == 0)
        return u | v;

    for (shift = 0; ((u | v) & 1) == 0; ++shift) {
        u >>= 1;
        v >>= 1;
    }

    while ((u & 1) == 0)
        u >>= 1;

    do {
        while ((v & 1) == 0)
            v >>= 1;

        if (u < v) {
            v -= u;
        } else {
            int diff = u - v;
            u = v;
            v = diff;
        }
        v >>= 1;
    } while (v != 0);

    return u << shift;
}

__device__ bool cuda_found;
__global__ void cudaKernal(int *cArray, int n, int *outr)
{
    int index = blockIdx.x * threads + threadIdx.x;

    int x = 1;
    int y = 2;
    int d = 4;
    int c = cArray[index];

    while(d == 1 && !cuda_found)
    {
        x = (int)(pow((float)x, 2) + c) % n;
        y = (int)(pow((float)y, 2) + c) % n;
        y = (int)(pow((float)y, 2) + c) % n;

        int abs = x - y;
        if(abs < 0)
            abs = abs * -1;
        d = gcd(abs, n);
    }
    if(d != 1 && !cuda_found)
    {
        cuda_found = true;
        outr = &d;
    }
}

int main ()
{
    int n = 323;

    int cArray[RunningTheads];  
    cArray[0] = 1;
    for(int i = 1; i < RunningTheads-1; i++)
    {
        cArray[i] = i+2;
    }

    int dresult = 0;
    int *dev_cArray;
    int *dev_result;

    HANDLE_ERROR(cudaMalloc((void**)&dev_cArray, RunningTheads*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_result, sizeof(int)));

    HANDLE_ERROR(cudaMemcpy(dev_cArray, cArray, RunningTheads*sizeof(int), cudaMemcpyHostToDevice));

    int TotalBlocks = ceil((float)RunningTheads/(float)threads);
    if(TotalBlocks > MaxBlocks)
        TotalBlocks = MaxBlocks;

    printf("Blocks: %d\n", TotalBlocks);
    printf("Threads: %d\n\n", threads);

    cudaKernal<<<TotalBlocks,threads>>>(dev_cArray, n, dev_result);

    HANDLE_ERROR(cudaMemcpy(&dresult, dev_result, sizeof(int), cudaMemcpyDeviceToHost));

    HANDLE_ERROR(cudaFree(dev_cArray));
    HANDLE_ERROR(cudaFree(dev_result));

    if(dresult == 0)
        dresult = 1;

    int d2 = n/dresult;

    printf("\nResult: %d", dresult);
    printf("\nResult2: %d", d2);


    int dummyReadForPause;
    scanf_s("%d",&dummyReadForPause);
}

Answer 1

让我们看一下你的内核代码：

__global__ void cudaKernal(int *cArray, int n, int *outr)
{
    int index = blockIdx.x * threads + threadIdx.x;

    int x = 1;
    int y = 2;
    int d = 4;
    int c = cArray[index];

    while(d == 1 && !cuda_found)     // always false because d is always 4
    {
        x = (int)(pow((float)x, 2) + c) % n;
        y = (int)(pow((float)y, 2) + c) % n;
        y = (int)(pow((float)y, 2) + c) % n;

        int abs = x - y;
        if(abs < 0)
            abs = abs * -1;
        d = gcd(abs, n);            // never writes to d because the loop won't 
                                    // be executed
    }
    if(d != 1 && !cuda_found)       // maybe true if cuda_found was initalized 
                                    // with false
    {
        cuda_found = true;   // Memory race here.
        outr = &d;           // you are changing the adresse where outr 
                             // points to; the host code does not see this  
                             // change. your cudaMemcpy dev -> host will copy 
                             // the exact values back from device that have 
                             // been uploaded by cudaMemcpy host -> dev
                             // if you want to set outr to 4 than write:
                             // *outr = d;
        }
    }

Answer 2

其中一个问题是你没有返回结果。在您的代码中，您只需更改内核函数中具有局部作用域的outr（即在此函数之外不会看到更改）。您应该编写*outr = d;来更改您使用outr指向的内存值。

并且我不确定CUDA是否将全局变量初始化为零。我的意思是，您确定始终使用cuda_found初始化false吗？

CUDA没有返回结果

2 个答案: