Question

我正在编写一个CUDA实现来生成＆＃34;魔术位板＆＃34;对于国际象棋引擎。我有一个有效的CUDA版本，但我试图对其进行优化。最后我有两个数组，＆＃39;问题＆＃39;并且＆＃39;回答＆＃39;并且我试图生成一个幻数以乘以任何特定问题，并得到一个散列索引。然后我使用该索引在表中查找值。目标是生成索引0- [n]，或生成完美的散列＆＃39;功能

但是，我认为主要问题是访问内存。以下代码部分确实返回了一个幻数，但它不正确。

代码的断面部分如下所示：

int index = threadIdx.x;
__shared__ int magic_good;
__shared__ u64_t magic_number;
// only try 1000 times, as many times this algorithm can't ever succeed
// more iterations are done at a higher level
for (int tries = 0; tries < 1000; tries++) {
    resulting_magic_moves[index] = 0xffffffffffffffff;
    magic_good = 1;
    if (index == 0) { // only generate one random number
        magic_number = random(); // using KISS as a random number generator, actually
    }

    __syncthreads();
   int magic_index = (question[index] * magic_number) >> (64 - questions_bits);
   // this 'magic_index' is basically just a random number at this point
   // as a result, I need some sort of locking on this array...
   resulting_magic_moves[magic_index] = answers[index]; // this could be set by multiple threads
   // this is my attempt to deal with the locking
   __syncthreads();
   if (resulting_magic_moves[magic_index] != answers[index]) {
        magic_good = 0;
    }
    __syncthreads();
    if (magic_good) // set to one at the beginning
        break; // it worked!  Or did it?
}
if (magic_good && index == 0)
    *in_magic_number = magic_number;

我相信数组resul_magic_moves [magic_index]正在每个线程中设置，并且编译器正在记住它放置了[index]＆＃39;到那个地方，没有其他部分会修改它。我确实将指针声明为＆＃39; volitile＆＃39;作为测试，但是没有用。

我使用gridDim为1，blockDim等于问题和答案的数量。我的调用代码如下：

magic_brute_force<<<1, questions>>>(d_magic_number, d_magic_moves_table, questions_bits, d_questions, d_answers, usec (a seed));

，其中

questions == 1<<questions_bits.  (The nature of the algorithm forces a power of 2 questions and answers).

如何正确锁定这一点的任何帮助将不胜感激。我有一个不同的版本，每个线程生成自己的编号并检查每个索引本身，但这似乎不是CUDA方式。

编辑：上面是一个简化版本，完整代码发布在下面，从文件中读取问题和答案，并注释掉功能性CUDA内核：

#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <sys/time.h>

#include "fileio.c" // compile it into one file... kinda ugly, but whatever...

typedef unsigned long long bb_t;

typedef struct {
    bb_t magic_number;
    int bits;
} magic_t;

// random number generator stolen from somewhere...
#define znew(m) (m[0]=36969*(m[0]&65535)+(m[0]>>16)) // z
#define wnew(m) (m[1]=18000*(m[1]&65535)+(m[1]>>16)) // w
#define MWC(m) ((znew(m)<<16)+wnew(m)) // znew, wnew
#define SHR3(m) (m[2]^=(m[2]<<17), m[2]^=(m[2]>>13), m[2]^=(m[2]<<5)) // jsr
#define CONG(m) (m[3]=69069*m[3]+1234567) //jcong
#define KISS(m) ((MWC(m)^CONG(m))+SHR3(m)) // MWC, CONG, SHR3
#define SETUP_KISS(m, seed) m[0] = seed; m[1] = (seed ^ 0xdeadbeef); m[2] = m[1] ^ 0x12345678; m[3] = m[1] + m[2]

#define LONG_KISS(m) ((((bb_t)KISS(m))<<32)|((bb_t)KISS(m)))
#define LONG_KISS_FEWBITS(m) LONG_KISS(m) & LONG_KISS(m) & LONG_KISS(m)

typedef unsigned long UL;

#define magic_perform(magic, occ) (((magic).magic_number * (occ)) >> (64 - (magic).bits))
const bb_t bitboard_universe = 0xffffffffffffffff;

#define TRIES 100

__global__ void magic_brute_force(magic_t *magic_answer, volatile bb_t *resulting_magic_moves,
                              const int questions_bits, const int desired_bits,
                              const bb_t *question, const bb_t *answer, int seed) {
/*    int desired_size = 1<<desired_bits;

    int index = threadIdx.x;
    int questions = 1<<questions_bits;
    UL random_state1[4];
    SETUP_KISS(random_state1, ((index+1)*seed));
    magic_t my_magic;
    my_magic.bits = desired_bits;
    bb_t too_small_table[512];
    int tries = TRIES;
    while (magic_answer->bits == 0 && tries--) {
    my_magic.magic_number = LONG_KISS_FEWBITS(random_state1);
    for (int a = 0; a < desired_size; a++) {
        too_small_table[a] = bitboard_universe;
    }
    int q;
    for (q = 0; q < questions; q++) {
        int index = magic_perform(my_magic, question[q]);
        if (too_small_table[index] == bitboard_universe) {
            too_small_table[index] = answer[q];
        } else if (answer[q] != too_small_table[index]) {
            break;
        }
    }
    if (q == questions) {
        *magic_answer = my_magic;
    }
    __syncthreads();
    }
    if (magic_answer->magic_number == my_magic.magic_number) {
    for (int a = 0; a < desired_size; a++) {
        resulting_magic_moves[a] = too_small_table[a];
    }
    }
*/
    int index = threadIdx.x;
    int desired_size = 1<<desired_bits;
    __shared__ int magic_good;
    magic_good = 0;
    UL rs[4];
    SETUP_KISS(rs, seed);
    __shared__ magic_t tmp_magic;
    __syncthreads();
    for (int tries = 0; tries < TRIES; tries++) {
    if (index < desired_size)
        resulting_magic_moves[index] = bitboard_universe;
    magic_good = 1;
    if (index == 0) {
        tmp_magic.magic_number = LONG_KISS_FEWBITS(rs);
        tmp_magic.bits = desired_bits;
    }
    __syncthreads();
    int magic_index = magic_perform(tmp_magic, question[index]);
    resulting_magic_moves[magic_index] = answer[index];
    __syncthreads();
    if (resulting_magic_moves[magic_index] != answer[index])
        magic_good = 0;
    if (index < desired_size)
        resulting_magic_moves[index] = bitboard_universe;
    __syncthreads();
    if (magic_good) {
        if (index == 0)
            *magic_answer = tmp_magic;
        break;
    }
    }
}

/**
 * Host main routine
 */
int main(int argc, char **argv)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    if (argc != 3) {
    printf("Usage %s filename answers_bits\n", argv[0]);
    exit(1);
    }

    FILE *file = fopen(argv[1], "r");
    if (!file) {
    printf("Couldn't open %s\n", argv[1]);
    fclose(file);
    exit(1);
    }
    int questions_bits;
    full_read(file, &questions_bits, sizeof(int));
    int answers_bits = atoi(argv[2]);

    size_t questions_size = 1<<questions_bits;
    size_t answers_size = 1<<answers_bits;
    printf("Magic number generation decrease by %d orders of magnitude\n", questions_bits - answers_bits);

    bb_t h_questions[questions_size];
    bb_t h_answers[questions_size];
    magic_t h_magic_answer;
    bb_t h_magic_moves[answers_size];

    printf("questions size: %i\nquestions bits: %i\nanswers size: %i\nannswers_bits: %i\n",
        questions_size, questions_bits, answers_size, answers_bits);

    // this simplifies the code below a little bit...
    questions_size *= sizeof(bb_t);
    answers_size *= sizeof(bb_t);
    if (full_read(file, h_questions, questions_size) ||
    full_read(file, h_answers, questions_size)) {
    printf("Couldn't read from %s\n", argv[1]);
    exit(1);
    }
    fclose(file);

    // Allocate the device memory
    bb_t *d_questions, *d_answers, *d_magic_moves;
    magic_t *d_magic_answer;
    UL *d_random_state;

    int mem_error;

    // this relies on short circuiting
    mem_error = ((err = cudaMalloc(&d_questions, questions_size)) != cudaSuccess) ||
            ((err = cudaMalloc(&d_answers, questions_size)) != cudaSuccess) ||
            ((err = cudaMalloc(&d_magic_moves, answers_size)) != cudaSuccess) ||
            ((err = cudaMalloc(&d_magic_answer, sizeof(magic_t))) != cudaSuccess) ||
            ((err = cudaMalloc(&d_random_state, 4*sizeof(UL))) != cudaSuccess);
    if (mem_error)
    {
    fprintf(stderr, "Failed to allocate device vector (error code %s)!\n", cudaGetErrorString(err));
    return -1;
    }

    printf("Copy input data from the host memory to the CUDA device\n");
    mem_error = ((err = cudaMemcpy(d_questions, h_questions, questions_size, cudaMemcpyHostToDevice)) != cudaSuccess) ||
            ((err = cudaMemcpy(d_answers, h_answers, questions_size, cudaMemcpyHostToDevice)) != cudaSuccess) ||
            ((err = cudaMemset(d_magic_answer, 0, sizeof(magic_t))) != cudaSuccess) ||
            ((err = cudaMemset(d_random_state, 0xff, sizeof(UL)*4)) != cudaSuccess);

    if (mem_error)
    {
    fprintf(stderr, "Failed to copy vector from host to device (error code %s)!\n", cudaGetErrorString(err));
    return -1;
    }

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = questions_size/sizeof(bb_t);
    int blocksPerGrid = 1; // (questions_size + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    struct timeval start, end;
    bb_t usec = 0;
    struct timezone tz;
    h_magic_answer.bits = 0;
    bb_t total_iterations = 0;
    gettimeofday(&start, &tz);
    while (h_magic_answer.bits == 0) {
    magic_brute_force<<<blocksPerGrid, threadsPerBlock>>>(d_magic_answer, d_magic_moves, questions_bits, answers_bits, d_questions, d_answers, usec);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch brute force kernel (error code %s)!\n", cudaGetErrorString(err));
        return -1;
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    mem_error = ((err = cudaMemcpy(&h_magic_answer, d_magic_answer, sizeof(magic_t), cudaMemcpyDeviceToHost)) != cudaSuccess);

    if (mem_error)
    {
        fprintf(stderr, "Failed to copy magic_answer from device to host (error code %s)!\n", cudaGetErrorString(err));
        return -1;
    }
    gettimeofday(&end, &tz);
    usec = (((bb_t)end.tv_sec*1000000)+end.tv_usec)-(((bb_t)start.tv_sec)*1000000+start.tv_usec);
    int iterations =  TRIES*threadsPerBlock*blocksPerGrid;
    total_iterations += iterations;
    // as it turns out it/usec == million it/sec, unit wise...
    float million_it_per_sec = ((float)total_iterations)/usec;
    if (total_iterations / 1000000 != (total_iterations - iterations) / 1000000) {
        printf("Ran in %lf million iterations in %lld.%03lld seconds at %f million it/sec\n", ((double)total_iterations)/1000000, usec / 1000000, usec % 1000, million_it_per_sec);
    }
    }

    mem_error = ((err = cudaMemcpy(h_magic_moves, d_magic_moves, answers_size, cudaMemcpyDeviceToHost)) != cudaSuccess);

    if (mem_error)
    {
    fprintf(stderr, "Failed to copy magic_moves from device to host (error code %s)!\n", cudaGetErrorString(err));
    return -1;
    }
    printf("Testing magic numbers\n");
    questions_size /= sizeof(bb_t);
    int i;
    printf("Magic Number: %i\n", h_magic_answer.magic_number);
    printf("Magic Bits: %i\n", h_magic_answer.bits);
    for (i = 0; i < questions_size; i++) {
    if (h_answers[i] != h_magic_moves[magic_perform(h_magic_answer, h_questions[i])])
        break;
    }
    if (i == questions_size)
    printf("Test PASSED\n");
    else
    printf("Test FAILED!\n");

    // Free device global memory
    mem_error = ((err = cudaFree(d_random_state)) != cudaSuccess) ||
            ((err = cudaFree(d_magic_answer)) != cudaSuccess) ||
            ((err = cudaFree(d_magic_moves)) != cudaSuccess) ||
            ((err = cudaFree(d_questions)) != cudaSuccess) ||
            ((err = cudaFree(d_answers)) != cudaSuccess);

    if (mem_error)
    {
    fprintf(stderr, "Failed to free device memory (error code %s)!\n", cudaGetErrorString(err));
    return -1;
    }

    // Reset the device and exit
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
    fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
    return -1;
    }

    printf("Done\n");
    return 0;
}

双重编辑： cuda-memcheck --tool racecheck返回以下输出：

========= ERROR: Potential RAW hazard detected at __shared__ 0x3 in block (0, 0, 0) :
=========     Write Thread (145, 0, 0) at 0x00000158 in magic_brute_force(magic_t*, __int64 volatile *, int, int, __in
t64 const *, __int64 const *, int)
=========     Read Thread (330, 0, 0) at 0x00000a88 in magic_brute_force(magic_t*, __int64 volatile *, int, int, __int
64 const *, __int64 const *, int)
=========     Current Value : 0
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x331) [0x138371]
=========     Host Frame:./magic [0x1aea8]
=========     Host Frame:./magic [0x3b7e3]
=========     Host Frame:./magic [0x35e3]
=========     Host Frame:./magic [0x344a]
=========     Host Frame:./magic [0x3490]
=========     Host Frame:./magic [0x2ddd]
=========     Host Frame:/usr/lib/libc.so.6 (__libc_start_main + 0xf0) [0x20000]
=========     Host Frame:./magic [0x2599]

它打印出来很多。

Triple Edit：

我解决了自己的问题。如果你看一下代码的第二部分，你会注意到我错误地写了函数末尾的resul_magic_moves数组，其中包含：

resulting_magic_moves[index] = bitboard_universe;

因此破坏了我的结果。感谢所有的投入！

如果您只是在开头设置它，而不是在结尾处设置它，它可以正常工作（尽管比赛报告了内存访问问题）。

然而，第一个内核确实表现得更好，因此没有我希望的那么多进展。无论如何，谢谢你的帮助。

Answer 1

事实上，这不是cuda共享内存重叠问题。已发布的代码有效，并且不需要锁定。该实现包含一个逻辑错误，其中一个变量被重置为默认值。

从本质上讲，在magic_brute_force循环结束时你会看到：

if (index < desired_size)
    resulting_magic_moves[index] = bitboard_universe;

只是删除它以创建一个可用的内核。

我在问题本身中发布了更多信息，包括实际的代码示例和解决方案，以防其他人对CUDA内存问题感到疑惑。在这种情况下，简化的代码示例是处理cuda共享内存问题的有效方法。

线程写入重叠时访问CUDA中的共享内存

1 个答案: