我正在尝试用 CUDA 为每个 (x, y) 坐标实现 KNN(k 个最近邻)。当坐标数量不超过 7000 个时程序运行正常,超过之后就只会打印垃圾数据。是否有办法确保 CUDA 在调用 memcpy
函数之前完成计算?
主程序(main):
/*
 * Launch the KNN kernel, wait for the GPU, and copy the per-job neighbour
 * lists back to the host.
 *
 * The whole sequence lives in its own scope so the helper locals introduced
 * here cannot clash with other names in the enclosing function.
 */
{
    /* Total size of the result table: numOfJobsForCuda rows of (knn+1) entries.
       Computed once in size_t so the multiplication cannot wrap in int. */
    size_t knnResultBytes = (size_t)numOfJobsForCuda * (size_t)(knn+1) * sizeof(Coordinate);
    cudaError_t knnCudaStatus;

    runCuda(cudaAllCordDevice, cudaKNearestCord, numOfJobsPerSlave, knn, size, max, maxThreadNumber, numOfJobsForCuda);

    /* A kernel launch returns no status: configuration errors are reported by
       cudaGetLastError(), faults during execution by the next synchronizing call. */
    knnCudaStatus = cudaGetLastError();
    if (knnCudaStatus == cudaSuccess){
        knnCudaStatus = cudaDeviceSynchronize();
    }

    if (knnCudaStatus == cudaSuccess){
        printf("cuda success");
    }else
    {
        printf("cuda fail");
        /* Say why it failed instead of leaving the caller to guess. */
        printf(": %s\n", cudaGetErrorString(knnCudaStatus));
    }

    /* Zero-initialised so that a failed GPU run yields an all-zero table rather
       than uninitialised heap contents. */
    cudaFinishedCalculation = (Coordinate*)calloc(1, knnResultBytes);
    if (!cudaFinishedCalculation){
        printf("host allocation of %lu bytes failed\n", (unsigned long)knnResultBytes);
    }else if (knnCudaStatus == cudaSuccess){
        /* Copy back only when the kernel actually completed; after a failed
           kernel the device buffer does not hold valid results. */
        knnCudaStatus = cudaMemcpy(cudaFinishedCalculation, cudaKNearestCord, knnResultBytes, cudaMemcpyDeviceToHost);
        if (knnCudaStatus != cudaSuccess){
            printf("cudaMemcpy fail: %s\n", cudaGetErrorString(knnCudaStatus));
        }
    }
}
以及我的内核(.cu 文件):
/*
 * Kernel: fills cudaKNearest with one (knn + 1)-entry row per job.
 *
 * For every job j in [0, cudaSizeToCalc) the thread that owns j calls
 * calcKnnPerCoodinate() for coordinate index (j + startIndex) and copies the
 * (knn + 1) entries it produced into cudaKNearest[j * (knn + 1) ...].
 *
 * Parameters:
 *   cudaAllCoordArr - input coordinates, forwarded to calcKnnPerCoodinate
 *   cudaKNearest    - output table, must hold cudaSizeToCalc * (knn + 1) entries
 *   startIndex      - offset added to the job number to get the coordinate index
 *   knn, size, max  - forwarded unchanged to calcKnnPerCoodinate
 *   cudaSizeToCalc  - number of jobs handled by this launch
 *
 * Launch layout: any 1D grid/block shape works. Jobs are distributed with a
 * stride of gridDim.x * blockDim.x, so a <<<1, N>>> launch behaves exactly as
 * before while larger grids split the jobs between blocks.
 *
 * Each working thread takes its scratch row from the device heap, whose size
 * is bounded by cudaLimitMallocHeapSize; the host may need to raise that limit
 * for large knn / thread counts.
 */
__global__ void calcNCoordinates(Coordinate* cudaAllCoordArr, Coordinate* cudaKNearest,int startIndex, int knn, int size, Coordinate max, int cudaSizeToCalc)
{
    const int entriesPerJob = knn + 1;
    const int jobStride = gridDim.x * blockDim.x;
    int job = blockIdx.x * blockDim.x + threadIdx.x;

    /* Threads without any job skip the allocation entirely so they do not
       consume device heap for nothing. */
    if (job >= cudaSizeToCalc)
    {
        return;
    }

    Coordinate* scratchRow = (Coordinate*)malloc(entriesPerJob * sizeof(Coordinate));
    if (!scratchRow && entriesPerJob > 0)
    {
        /* Device heap exhausted: abort the kernel explicitly instead of
           writing through a null pointer. */
        __trap();
    }

    for ( ; job < cudaSizeToCalc; job += jobStride)
    {
        calcKnnPerCoodinate(cudaAllCoordArr, scratchRow, knn, size, job + startIndex, max);

        /* size_t offset: job * (knn + 1) may not fit in int for large inputs. */
        Coordinate* outRow = cudaKNearest + (size_t)job * (size_t)entriesPerJob;
        for (int j = 0; j < entriesPerJob; j++)
        {
            outRow[j] = scratchRow[j];
        }
    }

    free(scratchRow);
}
/*
 * Host wrapper: launches calcNCoordinates on a single block.
 *
 * Parameters are forwarded to the kernel unchanged; maxNumberOfThreads is the
 * requested block size and numOfJobsForCuda the number of jobs to compute.
 *
 * The launch is asynchronous and this function reports no status: the caller
 * should check cudaGetLastError() and a synchronizing call afterwards.
 */
void runCuda(Coordinate* cudaAllCoordArr, Coordinate* cudaKNearest,int startIndex, int knn, int size, Coordinate max, int maxNumberOfThreads, int numOfJobsForCuda)
{
    /* Nothing to compute: a launch here would only be rejected or wasted. */
    if (numOfJobsForCuda <= 0)
    {
        return;
    }

    int threadsPerBlock = maxNumberOfThreads;

    /* A block larger than the device supports makes the launch fail, so cap
       the request at the current device's limit when it can be queried. */
    int device = 0;
    int deviceMaxThreads = 0;
    if (cudaGetDevice(&device) == cudaSuccess &&
        cudaDeviceGetAttribute(&deviceMaxThreads, cudaDevAttrMaxThreadsPerBlock, device) == cudaSuccess &&
        deviceMaxThreads > 0 && threadsPerBlock > deviceMaxThreads)
    {
        threadsPerBlock = deviceMaxThreads;
    }

    /* Threads beyond the job count would have no work; the kernel strides
       over the jobs, so a smaller block still covers all of them. */
    if (threadsPerBlock > numOfJobsForCuda)
    {
        threadsPerBlock = numOfJobsForCuda;
    }

    calcNCoordinates<<<1, threadsPerBlock>>>(cudaAllCoordArr, cudaKNearest, startIndex, knn, size, max, numOfJobsForCuda);
}
如果我使用不超过 7000 个坐标运行,程序会打印成功;否则会打印失败。我已经尝试过调用 cudaDeviceSynchronize(),但没有解决问题。
我需要程序持续计算,直到全部完成。