I am using cudaEvent to measure the running time of my code, but I found something puzzling

Asked: 2013-07-24 14:28:47

Tags: cuda runtime measurement

I use the following code to measure the running time of my code:

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/find.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/pair.h>
#include <thrust/remove.h>

#include <math.h>
#include <fstream>
#include <string>
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
using namespace std;

const int MINCOUNTS = 20;
const int h = 10;
const int dim = 2;
//const int h2 = pow(double(h),double(dim));



struct DataType
{
    float d[dim];
};





void loadData(thrust::host_vector<DataType>& D_,string dir_, DataType& gt)
{



    fstream in(dir_.c_str(),ios::in);
    string tline;
    string::size_type position;
    getline(in,tline);

    int flag = atoi(tline.c_str());
    if(flag != 1)
    {
        cout<<"there is problem in file : "<<dir_<<endl;
        exit(-1);
    }

    getline(in,tline);
    int tot = atoi(tline.c_str());

    getline(in,tline);




    for(int i = 0; i < dim - 1; i++)
    {
        position = tline.find(" ");
        gt.d[i] = atof(tline.substr(0,position).c_str());
        tline = tline.substr(position+1, tline.size() - position);
    }
    gt.d[dim-1] = atof(tline.c_str());

    DataType dt;
    for(int i = 0; i < tot-1; i++)
    {
        getline(in,tline);
        for(int j = 0; j < dim - 1; j++) // renamed from i: it shadowed the outer loop variable
        {
            position = tline.find(" ");
            dt.d[j] = atof(tline.substr(0,position).c_str());
            tline = tline.substr(position+1, tline.size() - position);
        }
        dt.d[dim-1] = atof(tline.c_str());
        D_.push_back(dt);
    }
}



__global__ void initialSM(int *gpu_Mchanged1, int *gpu_Schanged1,int N)
{
    int index = blockIdx.x;
    if(index < N)
    {
        gpu_Mchanged1[index] = index;
        gpu_Schanged1[index] = index;
    }

}


//parallelCal<<<N,1>>>(gpu_Schanged1, gpu_input, gpu_msPoint, N);
__global__ void parallelCal(int* gpu_Schanged1, DataType *input, DataType *msPoint, int tot) // h (the file-scope constant) is the bandwidth of the kernel function
{

    int index = blockIdx.x;
    int dis = 0;

    int ii = 0;
    int i0 = 0;

    int inlierNum = 0;
    //  double h2 = 10000;

    if(index < tot)
    {
        dis = 0;
        i0 = gpu_Schanged1[index];

        for(unsigned int i = 0; i < dim; i++)
            msPoint[index].d[i] = 0;

        for(int i = 0 ;i < tot ;i++)
        {
            ii = gpu_Schanged1[i];


            dis = 0;
            for(unsigned int j = 0; j < dim; j++)
            {
                dis += (input[i0].d[j] - input[ii].d[j])*(input[i0].d[j] - input[ii].d[j]);
                if(dis > pow(double(h),2.0))
                    break;
            }

            if (dis < pow(double(h),2.0))
            {
                inlierNum++;
                for(unsigned int j = 0; j < dim; j++)
                    msPoint[index].d[j] += (input[ii].d[j] - input[i0].d[j]);
            }
        }


        //      msPoint[index].d[0] = inlierNum;
        for(unsigned int j = 0; j < dim; j++)
        {
            msPoint[index].d[j] /= inlierNum;
            msPoint[index].d[j] += input[i0].d[j];
        }

    }
}


//nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);
__global__ void nearestSearch(int *gpu_Schanged1,int *gpu_Mchanged1, DataType *msPoint, DataType *input, int tot, int *Sunchanged, int *Munchanged)
{
    int index = blockIdx.x;
    float dis = 0;
    float disMin = 1000000;
    int flag = -1;
    int i1;
    if(index < tot)
    {

        for(int i = 0; i < tot; i++)
        {
            i1 = gpu_Schanged1[i];

            dis = 0;
            for(int j = 0; j < dim; j++)
                dis += (msPoint[index].d[j] - input[i1].d[j])*(msPoint[index].d[j] - input[i1].d[j]);

            if(dis <= disMin)
            {
                disMin = dis;
                flag = i1;
            }
        }
        Sunchanged[gpu_Schanged1[index]] = index;
        Munchanged[gpu_Schanged1[index]] = flag;
        gpu_Mchanged1[index] = flag;
    }
}
////routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);
__global__ void routineTransmission(bool loop1st, int *gpu_Schanged1,int *gpu_Mchanged1, int *gpu_Sunchanged,int *gpu_Munchanged, const int tot)
{
    int index = blockIdx.x;
    bool find2 = false;

    if(index < tot)
    {
        int lastOne = -1;
        int thisOne = -1;
        int indexIter = index;
        while(1)
        {



            if(loop1st)
            {
                lastOne = gpu_Mchanged1[indexIter];
                thisOne = gpu_Mchanged1[lastOne];

                if(lastOne == thisOne)
                {
                    gpu_Munchanged[gpu_Schanged1[index]] = thisOne;
                    gpu_Mchanged1[index] = thisOne;
                    break;
                }
                indexIter = thisOne;
            }

            else
            {
                //              gpu_Mchanged1[index] = gpu_Schanged1[index];

                while(1)
                {
                    lastOne = gpu_Mchanged1[indexIter];
                    for(int i = 0; i < tot; i++)
                    {
                        if(i == indexIter)
                            continue;

                        if(lastOne == gpu_Schanged1[i])
                        {
                            thisOne = i;
                            find2 = true;
                            break;
                        }
                    }
                    if(find2 == false)
                        break;
                    indexIter = thisOne;
                    find2 = false;

                }
                if(thisOne != index && thisOne != -1)
                {
                    gpu_Munchanged[gpu_Schanged1[index]] = gpu_Schanged1[thisOne];
                    gpu_Mchanged1[index] = gpu_Schanged1[thisOne];
                }
                break;
            }
        }
    }

}
//

__global__ void deleteCircle(int *gpu_Mchanged1, int *gpu_Munchanged, const int N, bool loop1st)
{
    int index = blockIdx.x;
    int router0, router1;
    if(index < N)
    {
        if(loop1st)
        {
            router0 = gpu_Mchanged1[index];
            router1 = gpu_Mchanged1[router0];
            while(1)
            {

                if(index == router0 || index == router1)
                {
                    gpu_Munchanged[index] = index;
                    break;
                }
                if(router0 == router1)
                    break;
                router0 = gpu_Mchanged1[router1];
                router1 = gpu_Mchanged1[router0];
            }
        }

    }


}
__global__ void checkTheClusterSize(int *gpu_Mchanged1, int *gpu_Schanged1, int *gpu_Munchanged, int *gpu_clusterSize, int smallTot, int tot)
{
    int index = blockIdx.x;
    if(index < smallTot)
    {
        int count = 0;
        for(unsigned int i = 0; i < tot; i++)
        {
            if(gpu_Munchanged[i] == gpu_Mchanged1[index])
                count++;
        }
        gpu_clusterSize[index] = count;
        if(count <= MINCOUNTS)
            gpu_Schanged1[index] = -1;
    }

}
__global__ void checkTheCenterNum(int *gpu_Munchanged,int *gpu_Sunchanged, int *gpu_Kcounts ,int tot)
{
    int index = blockIdx.x;
    if(index < tot)
    {
        if (gpu_Kcounts[gpu_Munchanged[index]] < MINCOUNTS)
        {
            gpu_Sunchanged[index] = -1;
        }
    }


}

struct increaseOne: public thrust::unary_function<int, int>
{
    int operator()(int a_){return a_ + 1;} // fixed: "return a_++;" returned the unincremented value

};
//
__global__ void mergeCentreSimple(int* gpu_Munchanged, int *gpu_clusterSize, DataType* gpu_input,int *gpu_Schanged1, int *gpu_Mchanged1, int tot)
{
    int index = blockIdx.x;
    float dis = 0;
    float disMin = pow(double(h/2),2.0);
    int disMinIndex = -1;
    bool flag = false;
    if(index < tot)
    {
        for(unsigned int i = 0; i < tot; i++)
        {
            if(index == i)
                continue;


            dis = 0;
            for(unsigned int j = 0; j < dim; j++)
                dis += (gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j])*(gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j]);
            //          dis = (gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)*(gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)+(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2)*(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2);

            if(dis < disMin)
            {
                flag = true;
                disMin = dis;
                disMinIndex = i;
            }
        }

        if(flag)
            if(gpu_clusterSize[index] < gpu_clusterSize[disMinIndex])
            {
                gpu_Munchanged[gpu_Schanged1[index]] = gpu_Mchanged1[disMinIndex];
                gpu_Mchanged1[index] = gpu_Mchanged1[disMinIndex];

            }
    }
}



struct is_minus_one
{
    __host__ __device__
    bool operator()(const int x)
    {
        return(x == -1);
    }
};

typedef thrust::device_vector<int>::iterator dintiter;

int main(int argc, char** argv)
{
    //  int h = 100;
    using namespace std;
    thrust::host_vector<DataType> host_input;
    //  string dir = "/home/gaoy/cuda-workspace/DATA/input/dataMS/data_1.txt";
    string dir = "/home/gaoy/workspace/DATA/dataInput/gaussianDistribution_2500.txt";
    DataType gt;
    loadData(host_input,dir, gt);
    cudaEvent_t start,stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int loopTime = 100;
    float timeSum = 0;

    thrust::device_vector<DataType> device_input = host_input;  // copy the input from the host vector to the device
    int N = device_input.size();
    int rN = N;
    int lastSize, thisSize;

    DataType *gpu_input;
    gpu_input = thrust::raw_pointer_cast(&device_input[0]);

    thrust::device_vector<DataType> device_msPoint;
    device_msPoint.resize(N);
    DataType *gpu_msPoint;

    thrust::device_vector<int> device_Sunchanged;
    device_Sunchanged.resize(N);
    int *gpu_Sunchanged;
    gpu_Sunchanged = thrust::raw_pointer_cast(&device_Sunchanged[0]);

    thrust::device_vector<int> device_Munchanged;
    device_Munchanged.resize(N);
    int *gpu_Munchanged;
    gpu_Munchanged = thrust::raw_pointer_cast(&device_Munchanged[0]);

    thrust::device_vector<int> device_Schanged1;
    device_Schanged1.resize(N);
    int *gpu_Schanged1;
    gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);

    thrust::device_vector<int> device_Mchanged1;
    device_Mchanged1.resize(N);
    int *gpu_Mchanged1;
    gpu_Mchanged1 = thrust::raw_pointer_cast(&device_Mchanged1[0]);

    thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> new_end;

    thrust::device_vector<int> device_clusterSize;

    initialSM<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,N);

    bool loop1st = true;
    dintiter Mend, Send, Cend;
    int *gpu_clusterSize;
    gpu_msPoint = thrust::raw_pointer_cast(&device_msPoint[0]);




    for(int i = 0; i < loopTime; i++)
    {

        cudaFree(0);
        cudaEventRecord(start,0);


        while(1)
        {
            lastSize = device_Schanged1.size();
            N = lastSize;
            device_msPoint.resize(N);

            parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N); // gpu_msPoint has the same size as gpu_Mchanged1, but gpu_input keeps the original data size
            device_Mchanged1.resize(N);
            nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);

            routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);


            thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            //
            new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            N = new_end.first - device_Mchanged1.begin();
            device_Mchanged1.resize(N);
            device_Schanged1.resize(N);

            device_clusterSize.clear();
            device_clusterSize.resize(N);

            gpu_clusterSize = thrust::raw_pointer_cast(&device_clusterSize[0]);
            checkTheClusterSize<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,gpu_Munchanged, gpu_clusterSize,N,rN);

            Mend = thrust::remove_if(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin(),is_minus_one());
            Cend = thrust::remove_if(device_clusterSize.begin(), device_clusterSize.end(), device_Schanged1.begin(), is_minus_one());
            Send = thrust::remove(device_Schanged1.begin(), device_Schanged1.end(), -1);

            N =  Send - device_Schanged1.begin();
            device_Schanged1.resize(N);
            device_Mchanged1.resize(N);
            device_clusterSize.resize(N);
            mergeCentreSimple<<<N,1>>>(gpu_Munchanged,gpu_clusterSize, gpu_input, gpu_Schanged1, gpu_Mchanged1, N);
            thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            N =  new_end.first - device_Mchanged1.begin();
            device_Mchanged1.resize(N);
            device_Schanged1.resize(N);


            thisSize = N;
            if(lastSize == thisSize)
                break;
            loop1st = false;

            thrust::copy(device_Mchanged1.begin(),device_Mchanged1.end(),device_Schanged1.begin());
            device_Mchanged1.clear();
            gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);
        }
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);

        cudaEventElapsedTime(&time, start, stop);
        //      for(unsigned int ii = 0; ii < device_Mchanged1.size(); ii++)
        //          cout<<ii<<" "<<host_input[device_Schanged1[ii]].d[0]<<" "<<host_input[device_Schanged1[ii]].d[1]<<endl;

        timeSum += time;
        cout<<i<<" "<<time<<endl;
    }
    cout<<"elapsed: "<<timeSum/loopTime<<" ms"<<endl;









    return 0;


}

The output of the variable time is different on every iteration of the loop. This is what I get:

0 385.722
1 3.67507
2 3.64183
3 2.40269

But the code being timed does exactly the same thing each time. Which result should I trust? I am really confused by this. Thanks.

1 Answer:

Answer 0 (score: 1):

I don't see any obvious problems with your timing methodology, since you have posted the actual code. The statements about cudaFree(0); and start-up time are not relevant here, because your code already creates a CUDA context before the first timed region.
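
For reference, here is a minimal sketch of the usual event-timing pattern, with context creation and a warm-up launch deliberately kept outside the timed region; the empty warmup kernel is a hypothetical placeholder for real work:

// A minimal, self-contained sketch (not the asker's code): pay the CUDA
// start-up cost before the first cudaEventRecord so it cannot pollute the timing.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warmup() {} // hypothetical placeholder for real work

int main()
{
    cudaFree(0);                // forces context creation up front
    warmup<<<1,1>>>();          // first-launch overhead is paid here
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    warmup<<<1,1>>>();          // the work to be measured goes here
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop); // wait until the stop event has been reached

    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    printf("elapsed: %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}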

I cannot run your code, since it depends on a data file I don't have. However, the most likely explanation for the timing variation is that there is actual variation in the work being done or the time it takes. Even though you appear to be running the same code, it may take a different amount of time from run to run.

Here are some examples of how that can happen (I am not saying these are true of your code; I don't know):

  1. thrust::sort_by_key takes a different amount of time on a sequence that is already sorted than on an unsorted one. Since you are sorting in place, the question is whether you are re-sorting already-sorted data: the first pass through your timing loop may actually sort the data, while subsequent passes may be sorting data that is already sorted, which takes less time (see the sketch after this list).

  2. Another example is all of your .resize(N) operations. It seems to me that on the first pass through the loop these may do some actual resizing, whereas on subsequent passes, if N has not changed, no actual resizing takes place and the operations take less time.
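
Both effects are easy to demonstrate in isolation. Below is a minimal, self-contained sketch (hypothetical sizes, unrelated to the question's data file) that times an in-place thrust::sort_by_key on unsorted versus already-sorted input, and a device_vector resize that allocates versus one that changes nothing:

// Times four operations with cudaEvents: sorting unsorted data, re-sorting
// sorted data, a first (allocating) resize, and a same-size (no-op) resize.
#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/reverse.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

static cudaEvent_t gStart, gStop;

static void tic() { cudaEventRecord(gStart, 0); }

static float toc() // elapsed milliseconds since the last tic()
{
    cudaEventRecord(gStop, 0);
    cudaEventSynchronize(gStop);
    float ms = 0;
    cudaEventElapsedTime(&ms, gStart, gStop);
    return ms;
}

int main()
{
    cudaEventCreate(&gStart);
    cudaEventCreate(&gStop);

    const int n = 1 << 22; // hypothetical problem size
    thrust::device_vector<int> keys(n), vals(n);
    thrust::sequence(keys.begin(), keys.end());
    thrust::sequence(vals.begin(), vals.end());
    thrust::reverse(keys.begin(), keys.end()); // descending keys: unsorted input

    tic(); // the first pass actually sorts
    thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
    printf("sort, unsorted input: %f ms\n", toc());

    tic(); // the second pass re-sorts already-sorted data
    thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
    printf("sort, sorted input  : %f ms\n", toc());

    thrust::device_vector<int> v;
    tic(); // the first resize allocates device memory
    v.resize(n);
    printf("resize, first time  : %f ms\n", toc());

    tic(); // a same-size resize performs no allocation
    v.resize(n);
    printf("resize, same size   : %f ms\n", toc());

    cudaEventDestroy(gStart);
    cudaEventDestroy(gStop);
    return 0;
}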

Again, I don't know whether these hypotheses apply to your code; I am simply pointing out how repeated runs of the same code sequence can show variation in timing.

Obviously, since the code is the same, the question then becomes one of analyzing the data from run to run to see whether it is the same. An interesting test might be to count the number of passes through your while(1) loop before the break; is hit. That could also be instructive about what is going on.
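
The pass count is a one-line addition inside the question's main(); the sketch below only mirrors that structure with a toy stand-in for the per-pass work, since the real pipeline needs the missing data file:

// A toy stand-in for the convergence loop: count how many passes the host-side
// while(1) loop makes before break, and print the count next to each timing.
#include <iostream>

static int onePass(int n) { return (n + 1) / 2; } // hypothetical per-pass shrink

int main()
{
    int thisSize = 2500; // hypothetical starting N
    int lastSize;
    int passes = 0;
    while (1)
    {
        lastSize = thisSize;
        thisSize = onePass(lastSize); // kernel launches and thrust calls go here
        passes++;
        if (lastSize == thisSize)
            break;
    }
    std::cout << "converged after " << passes << " passes" << std::endl;
    return 0;
}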