我正在一个项目中,我必须在gpu上实现apriori算法以获得最佳性能。我有两个内核代码:
// Flag each candidate as frequent (1) or infrequent (0) by comparing its
// support count against the MIN_SUP threshold.
// Launch layout: one thread per candidate, 1-D grid; out_of_range is the
// number of valid entries, so the guard tolerates a ragged grid tail.
__global__ void generate_L(int d_input_array[], int check_array[],
                           int out_of_range)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < out_of_range)
    {
        // Branchless form of the original if/else threshold test.
        check_array[tid] = (d_input_array[tid] >= MIN_SUP) ? 1 : 0;
    }
}
第二:
// Device-global match counter.  The host must zero it with
// cudaMemcpyToSymbol before each launch and read it back with
// cudaMemcpyFromSymbol afterwards.
__device__ int EQUAL;

// One thread per element of b[]; each thread scans a[] and, if it finds its
// element there, increments EQUAL once.  After the kernel, EQUAL == number
// of elements of b that also occur in a (assuming b has no duplicates —
// TODO confirm with the caller).
__global__ void set_count_on_gpu(int a[], int b[], int a_size, int
b_size)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    // BUG FIX: guard the thread index.  The original read b[index] with no
    // bounds check, which is out-of-bounds whenever the grid is larger
    // than b_size.
    if (index < b_size && b_size <= a_size)
    {
        for (int j = 0; j < a_size; ++j)
        {
            if (b[index] == a[j])
            {
                // BUG FIX: EQUAL++ is a read-modify-write race across
                // threads and silently loses increments; use an atomic.
                atomicAdd(&EQUAL, 1);
                break;
            }
        }
    }
}
内核调用部分:
// Host side: for each candidate itemset in map C, test whether all of its
// items occur in transaction `a`; if so, increment that candidate's support.
int * a_array = a.data();
// Length of one candidate itemset (all keys of C are assumed to share it —
// TODO confirm).
int array_size = C.begin()->first.size();
int * b_array;// = (int *)malloc(C.begin()->first.size() *
sizeof(int));
int * d_a_array;
int * d_b_array;
// Device buffers: d_b_array holds one candidate at a time, d_a_array holds
// the transaction (copied once, reused across all candidates).
cudaMalloc((void**)&d_b_array, array_size * sizeof(int));
cudaMalloc((void**)&d_a_array, a.size()*sizeof(int));
cudaMemcpy(d_a_array, a_array,
a.size()*sizeof(int),cudaMemcpyHostToDevice);
// NOTE(review): this loop performs three blocking host<->device transfers
// and one tiny kernel launch PER candidate.  That per-launch/per-copy
// overhead is almost certainly why the GPU version is slower than the CPU
// one — batch all candidates into a single buffer and launch one kernel.
FOR_MAP(ii,C)
{
// Copy the current candidate itemset (the map key) into a contiguous
// vector so .data() gives a flat int array for the device copy.
VI b;
b.clear();
b=ii->first;
b_array = b.data();
cudaMemcpy(d_b_array, b_array, array_size *
sizeof(int),cudaMemcpyHostToDevice);
// One block of array_size threads.
// NOTE(review): this launch fails silently if array_size ever exceeds
// the 1024 threads-per-block limit — TODO confirm candidate sizes.
int block_num = 1;
// Reset the device-global match counter before the launch.
int eq = 0;
cudaMemcpyToSymbol(EQUAL,&eq,sizeof(int));
set_count_on_gpu<<<block_num,array_size>>>(d_a_array,d_b_array,
a.size(),array_size);
// Synchronous readback of the match count (blocks until the kernel is
// done).  No cudaGetLastError() anywhere — launch failures go unnoticed.
cudaMemcpyFromSymbol(&eq, EQUAL,sizeof(int));
// The candidate is contained in the transaction iff every item matched.
// NOTE(review): eq is int, b.size() is unsigned — signed/unsigned compare.
if (eq==b.size())
{
ii->second++;
}
}
第二次调用:
// Main Apriori iteration: generate candidates C_k, prune, count support on
// the dataset, then keep the frequent ones in L via the generate_L kernel.
// Stops after 7 passes or when the candidate/frequent sets become empty.
while(true)
{
if(index>7)
break;
generate_C();                     // build candidate set C for this pass
if(C.size()==0)
break;
cout<<"\nC"<<index<<"\n";
output(C);
prune();                          // drop candidates with infrequent subsets
if (C.size()==0)
{
break;
}
cout<<"\nC"<<index<<" after prune \n";
output(C);
scan_D();                         // count support of each candidate
cout<<"\nC"<<index<<"after scaning dataset \n";
output(C);
L.clear();
// NOTE(review): allocating host and device buffers every pass is costly;
// consider allocating a max-size buffer once outside the loop and reusing.
int * check_array = (int *) malloc(C.size()*sizeof(int));
int * input_array = (int *) malloc(C.size()*sizeof(int));
int * d_check_array;
int * d_input_array;
cudaMalloc((void**)&d_check_array, C.size()*sizeof(int));
cudaMalloc((void**)&d_input_array, C.size()*sizeof(int));
// Pack the support counts into a flat array for the kernel.
// BUG FIX: reset thr here — the original relied on thr already being 0
// from code outside this loop.
thr = 0;
FOR_MAP(ii,C)
{
input_array[thr] = ii->second;
thr++;
}
cudaMemcpy(d_input_array,
input_array,C.size()*sizeof(int),cudaMemcpyHostToDevice);
// BUG FIX: the original used block_num = C.size()/MAX_THREADS and
// C.size()/block_num threads, both of which truncate, so whenever
// C.size() was not a multiple of MAX_THREADS the tail candidates were
// never processed and their check_array slots were read uninitialized.
// Use ceil-division; generate_L's out_of_range guard handles the tail.
int thread_num = ((int)C.size() < MAX_THREADS) ? (int)C.size() : MAX_THREADS;
int block_num = ((int)C.size() + thread_num - 1) / thread_num;
generate_L<<<block_num,thread_num>>>
(d_input_array,d_check_array, C.size());
// Blocking copy — also synchronizes with the kernel before we read results.
cudaMemcpy(check_array,
d_check_array,C.size()*sizeof(int),cudaMemcpyDeviceToHost);
// Keep only the candidates the kernel flagged as frequent.
thr = 0;
FOR_MAP(ii,C)
{
if(check_array[thr] == 1)
{
L[ii->first] = ii->second;
}
thr++;
}
// BUG FIX: release the per-pass buffers BEFORE any early exit — the
// original broke out on L.size()==0 without reaching the frees below,
// leaking both host and device memory.
free(check_array);
cudaFree(d_check_array);
free(input_array);
cudaFree(d_input_array);
if (L.size()==0)
{
break;
}
cout<<"\nL"<<index<<"\n";
output(L);
index++;
thr = 0;
//generate_L();
}
我试图在不使用gpu的情况下运行实现,但是测量表明,在cpu代码上运行速度更快。
每个内核调用的块和线程大小都不相同。
我的CPU实现是基于 Apriori-Implementation-On-Cpu 项目修改而来的。
如何优化它?