我正在尝试用 OpenCL 为 L 系统(毕达哥拉斯树)编写一个并行算法:
var:A,B;
const: (,);
axiom:A;
rules:(B->BB),(A->B(A)A)
但我无法超过第9次迭代:第10次迭代返回的字符串是乱序的。这是我的内核:
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_amd_printf : enable
// Empty placeholder kernel: accepts a string length and has no body.
__kernel void l_system(int string_lenght){}
// One rewrite pass of the L-system: 'A' becomes "B(A)A", 'B' becomes "BB",
// every other character is copied unchanged. One work-item handles one
// input character.
//
// Parameters:
//   string_lenght  - number of input characters; work-items whose global id
//                    is >= this value skip all three phases.
//   sentence       - global buffer; read as input in phase 1 and overwritten
//                    with the expanded text in phase 3.
//   string         - local scratch buffer, addressed as 5 chars per work-item
//                    (index = local id * 5).
//   local_char_num - global int array; element [group] accumulates the number
//                    of output characters produced by that work-group.
//                    Element [99 + group] is also written in phase 3.
//
// NOTE(review): per the OpenCL specification, barrier() synchronizes only the
// work-items of a single work-group. Phase 3 reads the counters of other
// work-groups and stores its output into the same `sentence` buffer that
// other work-groups read their input from in phase 1, and nothing in this
// kernel orders those accesses between work-groups. This is a likely source
// of scrambled output once there are more work-groups than the device runs
// concurrently - TODO confirm. Fixing it needs a separate output buffer
// and/or separate kernel launches, i.e. a change to the kernel arguments.
__kernel void l_system_interation(int string_lenght, __global char *sentence, __local char *string, __global int * local_char_num)
{
int local_x = (int)get_local_id(0);
int local_size = (int)get_local_size(0);
int x = (int)get_global_id(0);
// `size` is never read below.
int size = (int)get_global_size(0);
int group = (int)get_group_id(0);
// Only assigned when x < string_lenght; every later use is behind the same guard.
int local_mem_index;
// Phase 1: stage the input character in local memory and reset the group counter.
if(x < string_lenght){
// Each work-item owns a 5-char slot (the longest expansion is 5 chars);
// the input character goes into the first char of the slot.
local_mem_index = local_x * 5;
string[local_mem_index] = sentence[x];
if(local_x == 0){
// The first work-item of the group zeroes this group's output counter.
atomic_xchg(&local_char_num[group], 0);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
// Phase 2: expand the character inside its slot and add the expansion length
// to the group counter.
if(x < string_lenght){
if(string[local_mem_index] == 'A'){
// A -> B(A)A : 5 output chars.
atomic_add(&local_char_num[group], 5);
string[local_mem_index] = 'B';
string[local_mem_index + 1] = '(';
string[local_mem_index + 2] = 'A';
string[local_mem_index + 3] = ')';
string[local_mem_index + 4] = 'A';
}
else if(string[local_mem_index] == 'B'){
// B -> BB : 2 output chars; the first char of the slot already holds 'B'.
atomic_add(&local_char_num[group], 2);
string[local_mem_index + 1] = 'B';
// Phase 3 recognises an A expansion by 'A' in the 3rd char of the slot,
// so that char is overwritten with '0' here.
string[local_mem_index + 2] = '0';
}
else{
// Any other character is copied unchanged: 1 output char.
atomic_add(&local_char_num[group], 1);
// Same marker reset as in the B branch; the store is written twice.
string[local_mem_index + 2] = '0';
string[local_mem_index + 2] = '0';
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
// Phase 3: the work-item with local id 0 writes the whole group's expanded
// text to `sentence`.
if(x < string_lenght){
if(local_x == 0){
// j is the write position in `sentence`.
int j = 0;
// Start offset = sum of the counters of all lower-numbered groups.
// Skipped for the work-item with global id 0, which starts at 0.
if(x != 0){
for(int l = 1;l <= group; l++)
{
// atomic_xchg returns the old value; the value stored back is an
// ordinary (non-atomic) read of the same element.
j += atomic_xchg(&local_char_num[group-l], local_char_num[group-l]);
}
// Copies this group's count to element [99 + group].
// NOTE(review): the purpose is not evident from this code (possibly
// debug output). For group g this index is the counter slot of
// group 99 + g, so the two uses overlap once group ids reach 99.
atomic_xchg(&local_char_num[99+group], local_char_num[group]);
}
for(int i = 0; i < local_size; i++){
// x is the global id of this group's first work-item, so x + i is the
// global id of the work-item that filled slot i; skip slots past the
// end of the input.
if(string_lenght > (x+i)){
local_mem_index = i * 5;
// 3rd char is not 'A': the slot holds "BB" or a single copied character.
if(string[local_mem_index+2] != 'A'){
sentence[j++] = string[local_mem_index];
if(string[local_mem_index] == 'B'){
sentence[j++] = string[local_mem_index+1];
}
continue;// slot done, move on to the next one
}
else{ // A rule
// The slot holds the 5-char expansion "B(A)A".
sentence[j++] = string[local_mem_index];
sentence[j++] = string[local_mem_index+1];
sentence[j++] = string[local_mem_index+2];
sentence[j++] = string[local_mem_index+3];
sentence[j++] = string[local_mem_index+4];
}// end of A-rule branch
}// end of bounds check on x + i
}// end of slot loop
}// end of local_x == 0
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
我认为是某处发生了溢出,但找不到具体位置……也可能问题出在我 main 函数(主机端)的代码中:
cl_int letter_count = 0;
cl_int next_letter_count = 1;
for (int i = 0; i < iter_count; i++)
{
//printf("%s\n", sentence_init);
letter_count = next_letter_count;
next_letter_count = STRING_LENGTH_PAR((i + 1));
printf("in count: %d out count: %d\n", letter_count, next_letter_count);
CheckOpenCLError(clSetKernelArg(kernel_iteration, 0, sizeof(cl_int), &letter_count), "clSetKernelArg: letter_count");
CheckOpenCLError(clSetKernelArg(kernel_iteration, 2, sizeof(cl_char)* (local * RULE_SIZE + 1), NULL), "clSetKernelArg: tmp_string");
CheckOpenCLError(clEnqueueNDRangeKernel(queue, kernel_iteration, 1, NULL, &global, &local, 0, NULL, &kernel_iteration_event), "clEnqueueNDRangeKernel: kernel_iteration");
CheckOpenCLError(clFinish(queue), "clFinish");
kernel_computing_time += getEventTime(kernel_iteration_event);
}
CheckOpenCLError(clEnqueueReadBuffer(queue, sentence_dev, CL_TRUE, 0, sizeof(cl_char)* (next_letter_count), sentence_result, 0, NULL, &result_iteration_event), "clEnqueueReadBuffer: result_iteration_event");
cl_int *p = (cl_int*)malloc(sizeof(cl_int)*(STRING_LENGTH_PAR(iter_count)));
CheckOpenCLError(clEnqueueReadBuffer(queue, p_dev, CL_TRUE, 0, sizeof(cl_int)* (STRING_LENGTH_PAR(iter_count)), p, 0, NULL, &result_iteration_event), "clEnqueueReadBuffer: result_iteration_event");