I use the following code to measure the running time of my code:
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/find.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/pair.h>
#include <thrust/remove.h>
#include <math.h>
#include <fstream>
#include <string>
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
using namespace std;
const int MINCOUNTS = 20;
const int h = 10;
const int dim = 2;
//const int h2 = pow(double(h),double(dim));
struct DataType
{
float d[dim];
};
void loadData(thrust::host_vector<DataType>& D_,string dir_, DataType& gt)
{
fstream in(dir_.c_str(),ios::in);
string tline;
string::size_type position;
getline(in,tline);
int flag = atoi(tline.c_str());
if(flag != 1)
{
cout<<"there is problem in file : "<<dir_<<endl;
exit(-1);
}
getline(in,tline);
int tot = atoi(tline.c_str());
getline(in,tline);
for(int i = 0; i < dim - 1; i++)
{
position = tline.find(" ");
gt.d[i] = atof(tline.substr(0,position).c_str());
tline = tline.substr(position+1, tline.size() - position);
}
gt.d[dim-1] = atof(tline.c_str());
DataType dt;
for(int i = 0; i < tot-1; i++)
{
getline(in,tline);
for(int i = 0; i < dim - 1; i++)
{
position = tline.find(" ");
dt.d[i] = atof(tline.substr(0,position).c_str());
tline = tline.substr(position+1, tline.size() - position);
}
dt.d[dim-1] = atof(tline.c_str());
D_.push_back(dt);
}
}
__global__ void initialSM(int *gpu_Mchanged1, int *gpu_Schanged1,int N)
{
int index = blockIdx.x;
if(index < N)
{
gpu_Mchanged1[index] = index;
gpu_Schanged1[index] = index;
}
}
//parallelCal<<<N,1>>>(gpu_Schanged1, gpu_input, gpu_msPoint, N);
__global__ void parallelCal(int* gpu_Schanged1, DataType *input, DataType *msPoint, int tot) // h is the bandwidth of the kernel function
{
int index = blockIdx.x;
float dis = 0; // squared distances are floating point; accumulating them in an int would truncate
int ii = 0;
int i0 = 0;
int inlierNum = 0;
// double h2 = 10000;
if(index < tot)
{
dis = 0;
i0 = gpu_Schanged1[index];
for(unsigned int i = 0; i < dim; i++)
msPoint[index].d[i] = 0;
for(int i = 0 ;i < tot ;i++)
{
ii = gpu_Schanged1[i];
dis = 0;
for(unsigned int j = 0; j < dim; j++)
{
dis += (input[i0].d[j] - input[ii].d[j])*(input[i0].d[j] - input[ii].d[j]);
if(dis > pow(double(h),2.0))
break;
}
if (dis < pow(double(h),2.0))
{
inlierNum++;
for(unsigned int j = 0; j < dim; j++)
msPoint[index].d[j] += (input[ii].d[j] - input[i0].d[j]);
}
}
// msPoint[index].d[0] = inlierNum;
for(unsigned int j = 0; j < dim; j++)
{
msPoint[index].d[j] /= inlierNum;
msPoint[index].d[j] += input[i0].d[j];
}
}
}
//nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);
__global__ void nearestSearch(int *gpu_Schanged1,int *gpu_Mchanged1, DataType *msPoint, DataType *input, int tot, int *Sunchanged, int *Munchanged)
{
int index = blockIdx.x;
float dis = 0;
float disMin = 1000000;
int flag = -1;
int i1;
if(index < tot)
{
for(int i = 0; i < tot; i++)
{
i1 = gpu_Schanged1[i];
dis = 0;
for(int j = 0; j < dim; j++)
dis += (msPoint[index].d[j] - input[i1].d[j])*(msPoint[index].d[j] - input[i1].d[j]);
if(dis <= disMin)
{
disMin = dis;
flag = i1;
}
}
Sunchanged[gpu_Schanged1[index]] = index;
Munchanged[gpu_Schanged1[index]] = flag;
gpu_Mchanged1[index] = flag;
}
}
////routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);
__global__ void routineTransmission(bool loop1st, int *gpu_Schanged1,int *gpu_Mchanged1, int *gpu_Sunchanged,int *gpu_Munchanged, const int tot)
{
int index = blockIdx.x;
bool find2 = false;
if(index < tot)
{
int lastOne = -1;
int thisOne = -1;
int indexIter = index;
while(1)
{
if(loop1st)
{
lastOne = gpu_Mchanged1[indexIter];
thisOne = gpu_Mchanged1[lastOne];
if(lastOne == thisOne)
{
gpu_Munchanged[gpu_Schanged1[index]] = thisOne;
gpu_Mchanged1[index] = thisOne;
break;
}
indexIter = thisOne;
}
else
{
// gpu_Mchanged1[index] = gpu_Schanged1[index];
while(1)
{
lastOne = gpu_Mchanged1[indexIter];
for(int i = 0; i < tot; i++)
{
if(i == indexIter)
continue;
if(lastOne == gpu_Schanged1[i])
{
thisOne = i;
find2 = true;
break;
}
}
if(find2 == false)
break;
indexIter = thisOne;
find2 = false;
}
if(thisOne != index && thisOne != -1)
{
gpu_Munchanged[gpu_Schanged1[index]] = gpu_Schanged1[thisOne];
gpu_Mchanged1[index] = gpu_Schanged1[thisOne];
}
break;
}
}
}
}
//
__global__ void deleteCircle(int *gpu_Mchanged1, int *gpu_Munchanged, const int N, bool loop1st)
{
int index = blockIdx.x;
int router0, router1;
if(index < N)
{
if(loop1st)
{
router0 = gpu_Mchanged1[index];
router1 = gpu_Mchanged1[router0];
while(1)
{
if(index == router0 || index == router1)
{
gpu_Munchanged[index] = index;
break;
}
if(router0 == router1)
break;
router0 = gpu_Mchanged1[router1];
router1 = gpu_Mchanged1[router0];
}
}
}
}
__global__ void checkTheClusterSize(int *gpu_Mchanged1, int *gpu_Schanged1, int *gpu_Munchanged, int *gpu_clusterSize, int smallTot, int tot)
{
int index = blockIdx.x;
if(index < smallTot)
{
int count = 0;
for(unsigned int i = 0; i < tot; i++)
{
if(gpu_Munchanged[i] == gpu_Mchanged1[index])
count++;
}
gpu_clusterSize[index] = count;
if(count <= MINCOUNTS)
gpu_Schanged1[index] = -1;
}
}
__global__ void checkTheCenterNum(int *gpu_Munchanged,int *gpu_Sunchanged, int *gpu_Kcounts ,int tot)
{
int index = blockIdx.x;
if(index < tot)
{
if (gpu_Kcounts[gpu_Munchanged[index]] < MINCOUNTS)
{
gpu_Sunchanged[index] = -1;
}
}
}
struct increaseOne: public thrust::unary_function<int, int>
{
int operator()(int a_) const {return a_ + 1;} // a_++ would return the value before the increment
};
//
__global__ void mergeCentreSimple(int* gpu_Munchanged, int *gpu_clusterSize, DataType* gpu_input,int *gpu_Schanged1, int *gpu_Mchanged1, int tot)
{
int index = blockIdx.x;
float dis = 0;
float disMin = pow(double(h/2),2.0);
int disMinIndex = -1;
bool flag = false;
if(index < tot)
{
for(unsigned int i = 0; i < tot; i++)
{
if(index == i)
continue;
dis = 0;
for(unsigned int j = 0; j < dim; j++)
dis += (gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j])*(gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j]);
// dis = (gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)*(gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)+(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2)*(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2);
if(dis < disMin)
{
flag = true;
disMin = dis;
disMinIndex = i;
}
}
if(flag)
if(gpu_clusterSize[index] < gpu_clusterSize[disMinIndex])
{
gpu_Munchanged[gpu_Schanged1[index]] = gpu_Mchanged1[disMinIndex];
gpu_Mchanged1[index] = gpu_Mchanged1[disMinIndex];
}
}
}
struct is_minus_one
{
__host__ __device__
bool operator()(const int x)
{
return(x == -1);
}
};
typedef thrust::device_vector<int>::iterator dintiter;
int main(int argc, char** argv)
{
// int h = 100;
using namespace std;
thrust::host_vector<DataType> host_input;
// string dir = "/home/gaoy/cuda-workspace/DATA/input/dataMS/data_1.txt";
string dir = "/home/gaoy/workspace/DATA/dataInput/gaussianDistribution_2500.txt";
DataType gt;
loadData(host_input,dir, gt);
cudaEvent_t start,stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int loopTime = 100;
float timeSum = 0;
thrust::device_vector<DataType> device_input = host_input; // copy the input data to the device
int N = device_input.size();
int rN = N;
int lastSize, thisSize;
DataType *gpu_input;
gpu_input = thrust::raw_pointer_cast(&device_input[0]);
thrust::device_vector<DataType> device_msPoint;
device_msPoint.resize(N);
DataType *gpu_msPoint;
thrust::device_vector<int> device_Sunchanged;
device_Sunchanged.resize(N);
int *gpu_Sunchanged;
gpu_Sunchanged = thrust::raw_pointer_cast(&device_Sunchanged[0]);
thrust::device_vector<int> device_Munchanged;
device_Munchanged.resize(N);
int *gpu_Munchanged;
gpu_Munchanged = thrust::raw_pointer_cast(&device_Munchanged[0]);
thrust::device_vector<int> device_Schanged1;
device_Schanged1.resize(N);
int *gpu_Schanged1;
gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);
thrust::device_vector<int> device_Mchanged1;
device_Mchanged1.resize(N);
int *gpu_Mchanged1;
gpu_Mchanged1 = thrust::raw_pointer_cast(&device_Mchanged1[0]);
thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> new_end;
thrust::device_vector<int> device_clusterSize;
initialSM<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,N);
bool loop1st = true;
dintiter Mend, Send, Cend;
int *gpu_clusterSize;
gpu_msPoint = thrust::raw_pointer_cast(&device_msPoint[0]);
for(int i = 0; i < loopTime; i++)
{
cudaFree(0);
cudaEventRecord(start,0);
while(1)
{
lastSize = device_Schanged1.size();
N = lastSize;
device_msPoint.resize(N);
parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N); //the size of the gpu_msPoint is as the same as the gpu_Mchanged1; but the gpu_input is the original data size
device_Mchanged1.resize(N);
nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);
routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);
thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
//
new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
N = new_end.first - device_Mchanged1.begin();
device_Mchanged1.resize(N);
device_Schanged1.resize(N);
device_clusterSize.clear();
device_clusterSize.resize(N);
gpu_clusterSize = thrust::raw_pointer_cast(&device_clusterSize[0]);
checkTheClusterSize<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,gpu_Munchanged, gpu_clusterSize,N,rN);
Mend = thrust::remove_if(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin(),is_minus_one());
Cend = thrust::remove_if(device_clusterSize.begin(), device_clusterSize.end(), device_Schanged1.begin(), is_minus_one());
Send = thrust::remove(device_Schanged1.begin(), device_Schanged1.end(), -1);
N = Send - device_Schanged1.begin();
device_Schanged1.resize(N);
device_Mchanged1.resize(N);
device_clusterSize.resize(N);
mergeCentreSimple<<<N,1>>>(gpu_Munchanged,gpu_clusterSize, gpu_input, gpu_Schanged1, gpu_Mchanged1, N);
thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
N = new_end.first - device_Mchanged1.begin();
device_Mchanged1.resize(N);
device_Schanged1.resize(N);
thisSize = N;
if(lastSize == thisSize)
break;
loop1st = false;
thrust::copy(device_Mchanged1.begin(),device_Mchanged1.end(),device_Schanged1.begin());
device_Mchanged1.clear();
gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
// for(unsigned int ii = 0; ii < device_Mchanged1.size(); ii++)
// cout<<ii<<" "<<host_input[device_Schanged1[ii]].d[0]<<" "<<host_input[device_Schanged1[ii]].d[1]<<endl;
timeSum += time;
cout<<i<<" "<<time<<endl;
}
cout<<"elapsed: "<<timeSum/loopTime<<" ms"<<endl;
return 0;
}
The value of the variable time printed in each iteration of the loop is different. This is the output I get:
0 385.722
1 3.67507
2 3.64183
3 2.40269
But the code I am timing does the same thing every time. Which result should I trust? I am really confused by this. Thanks.
Answer (score: 1):
I don't see any obvious problems with your timing method, based on the code you have actually posted. The point about cudaFree(0); and start-up time is irrelevant here, because your code already creates a CUDA context before the first timed region (the thrust::device_vector constructed from the host data before the loop already touches the device).
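If you want to convince yourself of that, here is a minimal, self-contained sketch (the vector size is just illustrative) that measures how long the first device allocation takes and then how long cudaFree(0) takes once a context already exists:

#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
#include <chrono>
#include <iostream>

int main()
{
    using clk = std::chrono::high_resolution_clock;
    auto t0 = clk::now();
    thrust::device_vector<int> v(1024);   // first device allocation: triggers lazy context creation
    auto t1 = clk::now();
    cudaFree(0);                          // the context already exists, so this is now cheap
    auto t2 = clk::now();
    std::cout << "device_vector construction (includes context creation): "
              << std::chrono::duration<double, std::milli>(t1 - t0).count() << " ms" << std::endl;
    std::cout << "cudaFree(0) afterwards: "
              << std::chrono::duration<double, std::milli>(t2 - t1).count() << " ms" << std::endl;
    return 0;
}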
I cannot run your code, since it depends on a data file I don't have. However, the most likely explanation for the variation in the timings is that there is actual variation in the work being done. Even though you appear to be running the same code, it may take a different amount of time from run to run.
Here are a couple of examples of how that can happen (I am not saying either of these is true of your code; I don't know):
For example, thrust::sort_by_key may take a different amount of time on a sequence that is already sorted than on an unsorted one. Since you are sorting in place, the question is whether you are sorting already-sorted data. The first pass through your timing loop may do the actual sorting work, while subsequent passes may be sorting data that is already sorted, which can take less time.
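If you want to check whether that effect matters on your GPU, a sketch like the following (array size and key type are just illustrative) times thrust::sort_by_key once on random keys and once more on the already-sorted result:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <cuda_runtime_api.h>
#include <cstdlib>
#include <iostream>

int main()
{
    const int n = 1 << 20;
    thrust::host_vector<int> h_keys(n), h_vals(n);
    for (int i = 0; i < n; i++) { h_keys[i] = rand(); h_vals[i] = i; }
    thrust::device_vector<int> keys = h_keys, vals = h_vals;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float unsortedMs = 0, sortedMs = 0;

    cudaEventRecord(start, 0);
    thrust::sort_by_key(keys.begin(), keys.end(), vals.begin()); // unsorted input
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&unsortedMs, start, stop);

    cudaEventRecord(start, 0);
    thrust::sort_by_key(keys.begin(), keys.end(), vals.begin()); // already-sorted input
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&sortedMs, start, stop);

    std::cout << "unsorted: " << unsortedMs << " ms, already sorted: " << sortedMs << " ms" << std::endl;
    return 0;
}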
Another example is all of your .resize(N) operations. It seems to me that on the first pass through the loop these may do some actual resizing, whereas on subsequent passes, if N has not changed, no actual resizing takes place, so the operations take less time.
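For what it's worth, here is a small sketch (sizes are illustrative) showing that a thrust::device_vector only reallocates when the requested size exceeds its current capacity, so shrinking or re-growing within that capacity is comparatively cheap:

#include <thrust/device_vector.h>
#include <iostream>

int main()
{
    thrust::device_vector<int> v(1000000);                  // first allocation
    std::cout << "capacity: " << v.capacity() << std::endl;
    v.resize(1000);                                          // shrink: size changes, capacity does not
    std::cout << "capacity after shrink: " << v.capacity() << std::endl;
    v.resize(1000000);                                       // grow back within capacity: no new allocation
    std::cout << "capacity after regrow: " << v.capacity() << std::endl;
    return 0;
}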
Again, I don't know whether either of these hypotheses applies to your code; I am simply pointing out how it is possible to run the same code sequence repeatedly and observe variation in the timing.
Obviously, since the code is the same, the question becomes one of analyzing the data from run to run to see whether it is the same. An interesting test might be to track how many passes are made through the while(1) loop before the break; statement is hit. That might also be instructive about what is going on.
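One way to do that (a sketch against the posted main(), with the existing kernel calls elided) is to count the passes through the inner while(1) loop and print the count next to each measured time:

for(int i = 0; i < loopTime; i++)
{
    int passes = 0;                      // reset the pass counter for every timed iteration
    cudaFree(0);
    cudaEventRecord(start,0);
    while(1)
    {
        passes++;                        // one more pass through the convergence loop
        // ... existing body: parallelCal, nearestSearch, routineTransmission,
        //     sort_by_key / unique_by_key / remove, mergeCentreSimple ...
        if(lastSize == thisSize)
            break;
        // ... existing copy / clear / raw_pointer_cast ...
    }
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    timeSum += time;
    cout<<i<<" "<<time<<" ms, "<<passes<<" passes"<<endl;
}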