How to optimize a CUDA Sieve of Eratosthenes

Posted: 2013-03-25 18:34:32

Tags: c cuda sieve-of-eratosthenes

I'm new to CUDA. To get my feet wet, I tried writing a Sieve of Eratosthenes (to find all the primes up to some number n).

There were a number of things I had to do to get it to work that seem like they shouldn't have been necessary. I'm curious whether anyone knows of a more natural (and still CUDA-optimized) approach.

  1. To get the entries marked as prime in the isPrime array, I have to do two separate kernel calls. The first counts the number of primes in each thread block and assigns each entry i the number of primes in that block that are less than i. Then I have to make a second call to add in the counts from all the previous blocks in order to get the final index.
  2. It's even worse than that, though, because to avoid heaps of concurrent reads I have to store the block's count in a separate array at every one of the THREADS_PER_BLOCK indices, effectively doubling the memory the algorithm needs. It seems like there should be a way for all the threads in a block to read one shared value instead of having to copy it so many times.
  3. Despite all that, there is still the problem of concurrent reads in the clearMultiples method. Especially for small primes like 2 and 3, every thread has to read the same value in. Is there any way to deal with this?
  4. Could anyone look at my code and tell me whether there is anything obvious I could do that would be simpler or more efficient?

    Is there anything I'm doing that is especially inefficient (besides printing out all the primes at the end, of course)?

    Is it necessary to call synchronize after every kernel call?

    Do I also need to synchronize after every memcpy?

    Finally, why doesn't it work when I set THREADS_PER_BLOCK to 512?

    Thanks

    #include <stdio.h>
    #include <cuda.h>
    #include <assert.h>
    #include <math.h>
    
    #define MAX_BLOCKS 256
    #define THREADS_PER_BLOCK 256 //Must be a power of 2
    #define BLOCK_SPACE 2 * THREADS_PER_BLOCK
    
    __global__ void initialize(int* isPrime, int n) {
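        // Grid-stride loop: mark indices 0 and 1 as non-prime, everything else as (tentatively) prime.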
        int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
        int step = gridDim.x * THREADS_PER_BLOCK;
        int i;
        for (i = idx; i <= 1; i += step) {
            isPrime[i] = 0;
        }
        for (; i < n; i += step) {
            isPrime[i] = 1;
        }
    }
    
    __global__ void clearMultiples(int* isPrime, int* primeList, int startInd,
            int endInd, int n) {
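        // Each y-thread takes one prime p from primeList[startInd..endInd); its x-threads
        // start at p*p and stride over the multiples of p, marking them composite.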
        int yidx = blockIdx.y * blockDim.y + threadIdx.y;
        int xidx = blockIdx.x * blockDim.x + threadIdx.x;
        int ystep = gridDim.y * blockDim.y;
        int xstep = gridDim.x * blockDim.x;
        for (int pnum = startInd + yidx; pnum < endInd; pnum += ystep) {
            int p = primeList[pnum];
            int pstart = p * (p + xidx);
            int pstep = p * xstep;
            for (int i = pstart; i < n; i += pstep) {
                isPrime[i] = 0;
            }
        }
    }
    
    __device__ void makeCounts(int* isPrime, int* addend, int start, int stop) {
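        // Exclusive prefix sum (up-sweep then down-sweep in shared memory) over the
        // THREADS_PER_BLOCK entries isPrime[start..stop): each entry gets its within-chunk
        // exclusive count, and addend gets the chunk's total count.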
        __shared__ int tmpCounts[BLOCK_SPACE];
        __shared__ int dumbCounts[BLOCK_SPACE];
        int idx = threadIdx.x;
        tmpCounts[idx] = ((start + idx) < stop) ? isPrime[start + idx] : 0;
        __syncthreads();
        int numEntries = THREADS_PER_BLOCK;
        int cstart = 0;
        while (numEntries > 1) {
            int prevStart = cstart;
            cstart += numEntries;
            numEntries /= 2;
            if (idx < numEntries) {
                int i1 = idx * 2 + prevStart;
                tmpCounts[idx + cstart] = tmpCounts[i1] + tmpCounts[i1 + 1];
            }
            __syncthreads();
        }
        if (idx == 0) {
            dumbCounts[cstart] = tmpCounts[cstart];
            tmpCounts[cstart] = 0;
        }
        while (cstart > 0) {
            int prevStart = cstart;
            cstart -= numEntries * 2;
            if (idx < numEntries) {
                int v1 = tmpCounts[idx + prevStart];
                int i1 = idx * 2 + cstart;
                tmpCounts[i1 + 1] = tmpCounts[i1] + v1;
                tmpCounts[i1] = v1;
                dumbCounts[i1] = dumbCounts[i1 + 1] = dumbCounts[idx + prevStart];
            }
            numEntries *= 2;
            __syncthreads();
        }
        if (start + idx < stop) {
            isPrime[start + idx] = tmpCounts[idx];
            addend[start + idx] = dumbCounts[idx];
        }
    }
    
    __global__ void createCounts(int* isPrime, int* addend, int lb, int ub) {
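        // Each block runs the shared-memory scan over successive THREADS_PER_BLOCK-wide
        // chunks of [lb, ub), striding by gridDim.x * THREADS_PER_BLOCK.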
        int step = gridDim.x * THREADS_PER_BLOCK;
        for (int i = lb + blockIdx.x * THREADS_PER_BLOCK; i < ub; i += step) {
            int start = i;
            int stop = min(i + step, ub);
            makeCounts(isPrime, addend, start, stop);
        }
    }
    
    __global__ void sumCounts(int* isPrime, int* addend, int lb, int ub,
            int* totalsum) {
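        // Launched with single-thread blocks: block idx walks offset idx of every chunk in
        // order, adding the running sum of earlier chunks' totals so the per-chunk counts
        // become segment-wide indices; block 0 also records the total prime count for [lb, ub).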
        int idx = blockIdx.x;
        int s = 0;
        for (int i = lb + idx; i < ub; i += THREADS_PER_BLOCK) {
            isPrime[i] += s;
            s += addend[i];
        }
        if (idx == 0) {
            *totalsum = s;
        }
    }
    
    __global__ void condensePrimes(int* isPrime, int* primeList, int lb, int ub,
            int primeStartInd, int primeCount) {
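        // Stream compaction: wherever the running count increases from i to i+1, i is prime,
        // so write i into its final slot in primeList.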
        int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
        int step = gridDim.x * THREADS_PER_BLOCK;
        for (int i = lb + idx; i < ub; i += step) {
            int term = isPrime[i];
            int nextTerm = i + 1 == ub ? primeCount : isPrime[i + 1];
            if (term < nextTerm) {
                primeList[primeStartInd + term] = i;
            }
        }
    }
    
    int main(void) {
        printf("Enter upper bound:\n");
        int n;
        scanf("%d", &n);
    
        int *isPrime, *addend, *numPrimes, *primeList;
        cudaError_t t = cudaMalloc((void**) &isPrime, n * sizeof(int));
        assert(t == cudaSuccess);
        t = cudaMalloc(&addend, n * sizeof(int));
        assert(t == cudaSuccess);
        t = cudaMalloc(&numPrimes, sizeof(int));
        assert(t == cudaSuccess);
        int primeBound = 2 * n / log(n);
        t = cudaMalloc(&primeList, primeBound * sizeof(int));
        assert(t == cudaSuccess);
        int numBlocks = min(MAX_BLOCKS,
                (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
        initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
        t = cudaDeviceSynchronize();
        assert(t == cudaSuccess);
    
        int bound = (int) ceil(sqrt(n));
    
        int lb;
        int ub = 2;
        int primeStartInd = 0;
        int primeEndInd = 0;
    
        while (ub < n) {
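            // Each pass: cross off multiples of the primes found in earlier segments,
            // then scan and compact the primes in the new segment [lb, ub).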
            if (primeEndInd > primeStartInd) {
                int lowprime;
                t = cudaMemcpy(&lowprime, primeList + primeStartInd, sizeof(int),
                        cudaMemcpyDeviceToHost);
                assert(t == cudaSuccess);
                int numcols = n / lowprime;
                int numrows = primeEndInd - primeStartInd;
                int threadx = min(numcols, THREADS_PER_BLOCK);
                int thready = min(numrows, THREADS_PER_BLOCK / threadx);
                int blockx = min(numcols / threadx, MAX_BLOCKS);
                int blocky = min(numrows / thready, MAX_BLOCKS / blockx);
                dim3 gridsize(blockx, blocky);
                dim3 blocksize(threadx, thready);
                clearMultiples<<<gridsize, blocksize>>>(isPrime, primeList,
                        primeStartInd, primeEndInd, n);
                t = cudaDeviceSynchronize();
                assert(t == cudaSuccess);
            }
            lb = ub;
            ub *= 2;
            if (lb >= bound) {
                ub = n;
            }
            numBlocks = min(MAX_BLOCKS,
                    (ub - lb + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
    
            createCounts<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, addend, lb, ub);
            t = cudaDeviceSynchronize();
            assert(t == cudaSuccess);
    
            sumCounts<<<THREADS_PER_BLOCK, 1>>>(isPrime, addend, lb, ub, numPrimes);
            t = cudaDeviceSynchronize();
            assert(t == cudaSuccess);
    
            int primeCount;
            t = cudaMemcpy(&primeCount, numPrimes, sizeof(int),
                    cudaMemcpyDeviceToHost);
            assert(t == cudaSuccess);
            assert(primeCount > 0);
    
            primeStartInd = primeEndInd;
            primeEndInd += primeCount;
            condensePrimes<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, primeList, lb,
                    ub, primeStartInd, primeCount);
            t = cudaDeviceSynchronize();
            assert(t == cudaSuccess);
        }
        int finalprimes[primeEndInd];
        t = cudaMemcpy(finalprimes, primeList, primeEndInd * sizeof(int),
                cudaMemcpyDeviceToHost);
        assert(t == cudaSuccess);
    
        t = cudaFree(isPrime);
        assert(t == cudaSuccess);
        t = cudaFree(addend);
        assert(t == cudaSuccess);
        t = cudaFree(numPrimes);
        assert(t == cudaSuccess);
        t = cudaFree(primeList);
        assert(t == cudaSuccess);
        for (int i = 0; i < primeEndInd; i++) {
            if (i % 16 == 0)
                printf("\n");
            else
                printf(" ");
            printf("%4d", finalprimes[i]);
        }
        printf("\n");
        return 0;
    }
    

1 Answer:

Answer 0 (score: 1)

To answer some of your questions:

  1. Fix your error checking, as described in the comments (a minimal sketch of one common pattern is shown under the synchronization question below).

  2. Define what you mean by "concurrent reads". You're worried about it, but I'm not sure what you mean by it.

  3.   

    Is it necessary to call synchronize after every kernel call?

    No, it is not. If your code is not working correctly, synchronizing after every kernel call and then doing proper error checking will tell you whether any kernels failed to launch or run correctly. For a relatively simple single-stream program like this, synchronization is generally not needed. The CUDA calls that need to be synchronous, such as cudaMemcpy, will do that for you automatically.
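
    As a rough sketch of the kind of error checking meant here (the macro name is just an illustration, not part of the CUDA API), one common pattern looks like this:

        #include <stdio.h>
        #include <stdlib.h>

        // Illustrative helper: check the return code of every CUDA runtime call.
        #define CUDA_CHECK(call)                                            \
            do {                                                            \
                cudaError_t err_ = (call);                                  \
                if (err_ != cudaSuccess) {                                  \
                    fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                            cudaGetErrorString(err_), __FILE__, __LINE__);  \
                    exit(EXIT_FAILURE);                                     \
                }                                                           \
            } while (0)

        // After a kernel launch, check the launch itself and (while debugging)
        // the execution as well:
        //     initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
        //     CUDA_CHECK(cudaGetLastError());       // reports launch errors
        //     CUDA_CHECK(cudaDeviceSynchronize());  // reports execution errors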

      

    Do I also need to synchronize after a memcpy?

    No, cudaMemcpy is synchronous by nature (it forces all prior CUDA calls in the same stream to complete before it begins, and it does not return control to the host thread until the copy is finished). If you don't want the blocking behavior (i.e., not returning control to the host thread until the copy is complete), you can use the cudaMemcpyAsync version of the call. To get around the behavior of forcing all previous CUDA calls to complete, you would use streams.
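
    As a rough illustration only (the h_primes buffer, count, and stream names are hypothetical, and error checking is omitted for brevity), the asynchronous variant looks something like this; note that the host buffer must be pinned for the copy to actually be asynchronous:

        // Pinned host buffer so the async copy can overlap with host work.
        int *h_primes;
        cudaMallocHost(&h_primes, count * sizeof(int));

        cudaStream_t stream;
        cudaStreamCreate(&stream);

        // Queue the copy; control returns to the host thread immediately.
        cudaMemcpyAsync(h_primes, primeList, count * sizeof(int),
                        cudaMemcpyDeviceToHost, stream);

        // ... independent host work can run here ...

        cudaStreamSynchronize(stream);  // wait before reading h_primes
        cudaStreamDestroy(stream);
        cudaFreeHost(h_primes);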

      

    Finally, why doesn't it work when I set THREADS_PER_BLOCK to 512?

    Please define what you mean by "it doesn't work". I compiled your code with THREADS_PER_BLOCK of 512 and of 256, and for an upper bound of 1000 it gave the same output in each case.