我正在编写一个CUDA实现来生成"魔术位板"对于国际象棋引擎。我有一个有效的CUDA版本,但我试图对其进行优化。最后我有两个数组,'问题'并且'回答'并且我试图生成一个幻数以乘以任何特定问题,并得到一个散列索引。然后我使用该索引在表中查找值。目标是生成索引0- [n],或生成完美的散列'功能
但是,我认为主要问题是访问内存。以下代码部分确实返回了一个幻数,但它不正确。
代码的断面部分如下所示:
int index = threadIdx.x;
__shared__ int magic_good;
__shared__ u64_t magic_number;
// only try 1000 times, as many times this algorithm can't ever succeed
// more iterations are done at a higher level
for (int tries = 0; tries < 1000; tries++) {
resulting_magic_moves[index] = 0xffffffffffffffff;
magic_good = 1;
if (index == 0) { // only generate one random number
magic_number = random(); // using KISS as a random number generator, actually
}
__syncthreads();
int magic_index = (question[index] * magic_number) >> (64 - questions_bits);
// this 'magic_index' is basically just a random number at this point
// as a result, I need some sort of locking on this array...
resulting_magic_moves[magic_index] = answers[index]; // this could be set by multiple threads
// this is my attempt to deal with the locking
__syncthreads();
if (resulting_magic_moves[magic_index] != answers[index]) {
magic_good = 0;
}
__syncthreads();
if (magic_good) // set to one at the beginning
break; // it worked! Or did it?
}
if (magic_good && index == 0)
*in_magic_number = magic_number;
我相信数组resul_magic_moves [magic_index]正在每个线程中设置,并且编译器正在记住它放置了[index]&#39;到那个地方,没有其他部分会修改它。我确实将指针声明为&#39; volitile&#39;作为测试,但是没有用。
我使用gridDim为1,blockDim等于问题和答案的数量。我的调用代码如下:
magic_brute_force<<<1, questions>>>(d_magic_number, d_magic_moves_table, questions_bits, d_questions, d_answers, usec (a seed));
,其中
questions == 1<<questions_bits. (The nature of the algorithm forces a power of 2 questions and answers).
如何正确锁定这一点的任何帮助将不胜感激。我有一个不同的版本,每个线程生成自己的编号并检查每个索引本身,但这似乎不是CUDA方式。
编辑:上面是一个简化版本,完整代码发布在下面,从文件中读取问题和答案,并注释掉功能性CUDA内核:
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include "fileio.c" // compile it into one file... kinda ugly, but whatever...
typedef unsigned long long bb_t;
typedef struct {
bb_t magic_number;
int bits;
} magic_t;
// random number generator stolen from somewhere...
#define znew(m) (m[0]=36969*(m[0]&65535)+(m[0]>>16)) // z
#define wnew(m) (m[1]=18000*(m[1]&65535)+(m[1]>>16)) // w
#define MWC(m) ((znew(m)<<16)+wnew(m)) // znew, wnew
#define SHR3(m) (m[2]^=(m[2]<<17), m[2]^=(m[2]>>13), m[2]^=(m[2]<<5)) // jsr
#define CONG(m) (m[3]=69069*m[3]+1234567) //jcong
#define KISS(m) ((MWC(m)^CONG(m))+SHR3(m)) // MWC, CONG, SHR3
#define SETUP_KISS(m, seed) m[0] = seed; m[1] = (seed ^ 0xdeadbeef); m[2] = m[1] ^ 0x12345678; m[3] = m[1] + m[2]
#define LONG_KISS(m) ((((bb_t)KISS(m))<<32)|((bb_t)KISS(m)))
#define LONG_KISS_FEWBITS(m) LONG_KISS(m) & LONG_KISS(m) & LONG_KISS(m)
typedef unsigned long UL;
#define magic_perform(magic, occ) (((magic).magic_number * (occ)) >> (64 - (magic).bits))
const bb_t bitboard_universe = 0xffffffffffffffff;
#define TRIES 100
__global__ void magic_brute_force(magic_t *magic_answer, volatile bb_t *resulting_magic_moves,
const int questions_bits, const int desired_bits,
const bb_t *question, const bb_t *answer, int seed) {
/* int desired_size = 1<<desired_bits;
int index = threadIdx.x;
int questions = 1<<questions_bits;
UL random_state1[4];
SETUP_KISS(random_state1, ((index+1)*seed));
magic_t my_magic;
my_magic.bits = desired_bits;
bb_t too_small_table[512];
int tries = TRIES;
while (magic_answer->bits == 0 && tries--) {
my_magic.magic_number = LONG_KISS_FEWBITS(random_state1);
for (int a = 0; a < desired_size; a++) {
too_small_table[a] = bitboard_universe;
}
int q;
for (q = 0; q < questions; q++) {
int index = magic_perform(my_magic, question[q]);
if (too_small_table[index] == bitboard_universe) {
too_small_table[index] = answer[q];
} else if (answer[q] != too_small_table[index]) {
break;
}
}
if (q == questions) {
*magic_answer = my_magic;
}
__syncthreads();
}
if (magic_answer->magic_number == my_magic.magic_number) {
for (int a = 0; a < desired_size; a++) {
resulting_magic_moves[a] = too_small_table[a];
}
}
*/
int index = threadIdx.x;
int desired_size = 1<<desired_bits;
__shared__ int magic_good;
magic_good = 0;
UL rs[4];
SETUP_KISS(rs, seed);
__shared__ magic_t tmp_magic;
__syncthreads();
for (int tries = 0; tries < TRIES; tries++) {
if (index < desired_size)
resulting_magic_moves[index] = bitboard_universe;
magic_good = 1;
if (index == 0) {
tmp_magic.magic_number = LONG_KISS_FEWBITS(rs);
tmp_magic.bits = desired_bits;
}
__syncthreads();
int magic_index = magic_perform(tmp_magic, question[index]);
resulting_magic_moves[magic_index] = answer[index];
__syncthreads();
if (resulting_magic_moves[magic_index] != answer[index])
magic_good = 0;
if (index < desired_size)
resulting_magic_moves[index] = bitboard_universe;
__syncthreads();
if (magic_good) {
if (index == 0)
*magic_answer = tmp_magic;
break;
}
}
}
/**
* Host main routine
*/
int main(int argc, char **argv)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
if (argc != 3) {
printf("Usage %s filename answers_bits\n", argv[0]);
exit(1);
}
FILE *file = fopen(argv[1], "r");
if (!file) {
printf("Couldn't open %s\n", argv[1]);
fclose(file);
exit(1);
}
int questions_bits;
full_read(file, &questions_bits, sizeof(int));
int answers_bits = atoi(argv[2]);
size_t questions_size = 1<<questions_bits;
size_t answers_size = 1<<answers_bits;
printf("Magic number generation decrease by %d orders of magnitude\n", questions_bits - answers_bits);
bb_t h_questions[questions_size];
bb_t h_answers[questions_size];
magic_t h_magic_answer;
bb_t h_magic_moves[answers_size];
printf("questions size: %i\nquestions bits: %i\nanswers size: %i\nannswers_bits: %i\n",
questions_size, questions_bits, answers_size, answers_bits);
// this simplifies the code below a little bit...
questions_size *= sizeof(bb_t);
answers_size *= sizeof(bb_t);
if (full_read(file, h_questions, questions_size) ||
full_read(file, h_answers, questions_size)) {
printf("Couldn't read from %s\n", argv[1]);
exit(1);
}
fclose(file);
// Allocate the device memory
bb_t *d_questions, *d_answers, *d_magic_moves;
magic_t *d_magic_answer;
UL *d_random_state;
int mem_error;
// this relies on short circuiting
mem_error = ((err = cudaMalloc(&d_questions, questions_size)) != cudaSuccess) ||
((err = cudaMalloc(&d_answers, questions_size)) != cudaSuccess) ||
((err = cudaMalloc(&d_magic_moves, answers_size)) != cudaSuccess) ||
((err = cudaMalloc(&d_magic_answer, sizeof(magic_t))) != cudaSuccess) ||
((err = cudaMalloc(&d_random_state, 4*sizeof(UL))) != cudaSuccess);
if (mem_error)
{
fprintf(stderr, "Failed to allocate device vector (error code %s)!\n", cudaGetErrorString(err));
return -1;
}
printf("Copy input data from the host memory to the CUDA device\n");
mem_error = ((err = cudaMemcpy(d_questions, h_questions, questions_size, cudaMemcpyHostToDevice)) != cudaSuccess) ||
((err = cudaMemcpy(d_answers, h_answers, questions_size, cudaMemcpyHostToDevice)) != cudaSuccess) ||
((err = cudaMemset(d_magic_answer, 0, sizeof(magic_t))) != cudaSuccess) ||
((err = cudaMemset(d_random_state, 0xff, sizeof(UL)*4)) != cudaSuccess);
if (mem_error)
{
fprintf(stderr, "Failed to copy vector from host to device (error code %s)!\n", cudaGetErrorString(err));
return -1;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = questions_size/sizeof(bb_t);
int blocksPerGrid = 1; // (questions_size + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
struct timeval start, end;
bb_t usec = 0;
struct timezone tz;
h_magic_answer.bits = 0;
bb_t total_iterations = 0;
gettimeofday(&start, &tz);
while (h_magic_answer.bits == 0) {
magic_brute_force<<<blocksPerGrid, threadsPerBlock>>>(d_magic_answer, d_magic_moves, questions_bits, answers_bits, d_questions, d_answers, usec);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch brute force kernel (error code %s)!\n", cudaGetErrorString(err));
return -1;
}
// Copy the device result vector in device memory to the host result vector
// in host memory.
mem_error = ((err = cudaMemcpy(&h_magic_answer, d_magic_answer, sizeof(magic_t), cudaMemcpyDeviceToHost)) != cudaSuccess);
if (mem_error)
{
fprintf(stderr, "Failed to copy magic_answer from device to host (error code %s)!\n", cudaGetErrorString(err));
return -1;
}
gettimeofday(&end, &tz);
usec = (((bb_t)end.tv_sec*1000000)+end.tv_usec)-(((bb_t)start.tv_sec)*1000000+start.tv_usec);
int iterations = TRIES*threadsPerBlock*blocksPerGrid;
total_iterations += iterations;
// as it turns out it/usec == million it/sec, unit wise...
float million_it_per_sec = ((float)total_iterations)/usec;
if (total_iterations / 1000000 != (total_iterations - iterations) / 1000000) {
printf("Ran in %lf million iterations in %lld.%03lld seconds at %f million it/sec\n", ((double)total_iterations)/1000000, usec / 1000000, usec % 1000, million_it_per_sec);
}
}
mem_error = ((err = cudaMemcpy(h_magic_moves, d_magic_moves, answers_size, cudaMemcpyDeviceToHost)) != cudaSuccess);
if (mem_error)
{
fprintf(stderr, "Failed to copy magic_moves from device to host (error code %s)!\n", cudaGetErrorString(err));
return -1;
}
printf("Testing magic numbers\n");
questions_size /= sizeof(bb_t);
int i;
printf("Magic Number: %i\n", h_magic_answer.magic_number);
printf("Magic Bits: %i\n", h_magic_answer.bits);
for (i = 0; i < questions_size; i++) {
if (h_answers[i] != h_magic_moves[magic_perform(h_magic_answer, h_questions[i])])
break;
}
if (i == questions_size)
printf("Test PASSED\n");
else
printf("Test FAILED!\n");
// Free device global memory
mem_error = ((err = cudaFree(d_random_state)) != cudaSuccess) ||
((err = cudaFree(d_magic_answer)) != cudaSuccess) ||
((err = cudaFree(d_magic_moves)) != cudaSuccess) ||
((err = cudaFree(d_questions)) != cudaSuccess) ||
((err = cudaFree(d_answers)) != cudaSuccess);
if (mem_error)
{
fprintf(stderr, "Failed to free device memory (error code %s)!\n", cudaGetErrorString(err));
return -1;
}
// Reset the device and exit
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
err = cudaDeviceReset();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
return -1;
}
printf("Done\n");
return 0;
}
双重编辑: cuda-memcheck --tool racecheck返回以下输出:
========= ERROR: Potential RAW hazard detected at __shared__ 0x3 in block (0, 0, 0) :
========= Write Thread (145, 0, 0) at 0x00000158 in magic_brute_force(magic_t*, __int64 volatile *, int, int, __in
t64 const *, __int64 const *, int)
========= Read Thread (330, 0, 0) at 0x00000a88 in magic_brute_force(magic_t*, __int64 volatile *, int, int, __int
64 const *, __int64 const *, int)
========= Current Value : 0
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x331) [0x138371]
========= Host Frame:./magic [0x1aea8]
========= Host Frame:./magic [0x3b7e3]
========= Host Frame:./magic [0x35e3]
========= Host Frame:./magic [0x344a]
========= Host Frame:./magic [0x3490]
========= Host Frame:./magic [0x2ddd]
========= Host Frame:/usr/lib/libc.so.6 (__libc_start_main + 0xf0) [0x20000]
========= Host Frame:./magic [0x2599]
它打印出来很多。
Triple Edit:
我解决了自己的问题。如果你看一下代码的第二部分,你会注意到我错误地写了函数末尾的resul_magic_moves数组,其中包含:
resulting_magic_moves[index] = bitboard_universe;
因此破坏了我的结果。感谢所有的投入!
如果您只是在开头设置它,而不是在结尾处设置它,它可以正常工作(尽管比赛报告了内存访问问题)。
然而,第一个内核确实表现得更好,因此没有我希望的那么多进展。无论如何,谢谢你的帮助。
答案 0 :(得分:1)
事实上,这不是cuda共享内存重叠问题。已发布的代码有效,并且不需要锁定。该实现包含一个逻辑错误,其中一个变量被重置为默认值。
从本质上讲,在magic_brute_force循环结束时你会看到:
if (index < desired_size)
resulting_magic_moves[index] = bitboard_universe;
只是删除它以创建一个可用的内核。
我在问题本身中发布了更多信息,包括实际的代码示例和解决方案,以防其他人对CUDA内存问题感到疑惑。在这种情况下,简化的代码示例是处理cuda共享内存问题的有效方法。