Question

我正在使用 CUDA 和 C++ 做一些并行计算。最近我遇到了一个无法理解的现象，搜索之后也没有找到相关资料：我的代码中有一行很少被执行（但又必不可少），即使它一次都没有被执行，也会拖慢整个程序。下面用一些代码来说明：

我创建的类：

// Simplified sketch of the author's class; the "..." lines stand for members left out of the question.
class Foo 
{ 
// Called by the kernel on one element; it receives the whole array of Foo objects.
void myFunction(Foo *listFoo);
//some other functions that I need
...
// Counter that myFunction increments when its rarely-true condition holds.
int myAttribute;
//some other attributes that I need
...
}

myFunction的定义：

// Works on listFoo and, only when the rarely-true condition holds, increments myAttribute.
void Foo::myFunction(Foo *listFoo)
{
//do some computations on the listFoo
if( condition seldom verified )
  {  myAttribute = myAttribute + 1; } // the store that the question reports as slowing the kernel down
}

全局（__global__）核函数：

// One thread per Foo: thread idx calls listFoo[idx].myFunction, passing the whole array.
__global__ void compute(Foo *listFoo, int numberOfFoo)
{
   const int idx = blockIdx.x * blockDim.x + threadIdx.x;
   // Threads beyond the end of the array (partially filled last block) have nothing to do.
   if( idx >= numberOfFoo)
   {  return; }
   listFoo[idx].myFunction(listFoo);
}

主机端代码：

// Launch enough 512-thread blocks to cover every Foo (block count rounded up).
compute<<<(numberOfFoo + 511)/512, 512>>> (listFoo, numberOfFoo);

拖慢一切的是 myAttribute = myAttribute + 1 这一行。即使它被执行了 0 次，与把这一行注释掉相比，代码仍然慢得多。我试过用一个简单的 printf 替换这一行，结果相同：该行从未被执行，却拖慢了一切。

如果您对原因有任何建议，并最终如何解决此问题，我们将非常感激。我的编程水平不是很高级，所以请使用相对简单的解释。

非常感谢

首次编辑：有几个人要求提供代码，所以现在贴在这里！我已经把它精简到 700 行；我知道它仍然很长，但如果继续删减，就剩不下多少能正常运行的内容了。它在我这里可以顺利编译。您只需按 Enter 键并等待几秒钟，所用时间就会显示在命令窗口中。

问题出在函数 findContactwithGrain() 中：addContact(grainContact) 这一行拖慢了一切。在我的计算机上，如果该行处于启用状态，一次计算大约需要 3.5 秒；如果把它注释掉，则只需 0.07 秒。对于一行从未被执行的代码来说，这是一个巨大的差异。

希望这有助于理解问题

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cmath>   // sqrt
#include <cstdlib> // exit, system
#include <fstream> // to read and write files
#include <iostream>
#include <sstream>
#include <stdio.h>
#include <string>
#include <time.h>
// Number of grains created in main; also used as the loop and guard bound in the device code.
#define n  200
using namespace std;

// Running counters: the Block and Grain constructors read them to hand out ids, then increment them.
int global_totalNumberBlock = 0;
int global_totalNumberGrain = 0;

// Forward declarations so the classes can name each other before they are defined.
class Vec3d2;
class Block;
class Grain;
class Contact;
class Analysis;

// Small 3-component vector of doubles, usable from both host and device code.
class Vec3d2
{
public:
    __host__ __device__ Vec3d2(void);
    __host__ __device__ Vec3d2(double x_value, double y_value, double z_value);
    __host__ __device__ ~Vec3d2(void);
    __host__ __device__ double dot(Vec3d2 a) const;
    __host__ __device__ Vec3d2 cross(Vec3d2 a) const;
    __host__ __device__ double norm() const;
    __host__ __device__ void normalize();

    // Writes the three components, one per line. Host only: iostreams are not
    // available in device code, so this must not carry __device__.
    __host__ friend ostream & operator <<(ostream &s,const Vec3d2 &vec) 
    { 
        s << vec.x << endl;
        s << vec.y << endl;
        s << vec.z << endl;
        return s; 
    } 

    // Read access by index: 0 -> x, 1 -> y, 2 -> z.
    // Any other index is reported and z is returned, so that every path returns a value
    // (flowing off the end of a non-void function is undefined behaviour).
    __host__ __device__ double operator [](int i) const 
    {
        if( i == 0)
        {
            return x;
        }
        if( i == 1)
        {
            return y;
        }
        if( i != 2)
        {
            reportBadIndex();
        }
        return z;
    }

    // Write access by index: 0 -> x, 1 -> y, 2 -> z.
    // Any other index is reported and a reference to z is returned (see the const overload).
    __host__ __device__ double & operator [](int i) 
    {
        if( i == 0)
        {
            return x;
        }
        if( i == 1)
        {
            return y;
        }
        if( i != 2)
        {
            reportBadIndex();
        }
        return z;
    }

    //attributes
    double x, y, z;

private:
    // Reports an out-of-range index. cout and system() are host-only, so the device
    // compilation pass uses printf instead; the host behaviour is the original one.
    __host__ __device__ static void reportBadIndex()
    {
#ifdef __CUDA_ARCH__
        printf("ERROR IN USING VEC3D2\n");
#else
        cout << "ERROR IN USING VEC3D2" << endl;
        system("PAUSE");
#endif
    }
};

//Class Vec3d2 functions and operators
// Default constructor: builds the zero vector.
Vec3d2::Vec3d2()
    : x(0), y(0), z(0)
{
}
// Builds a vector from its three components.
Vec3d2::Vec3d2(double x_value, double y_value, double z_value)
    : x(x_value), y(y_value), z(z_value)
{
}
// Destructor: the vector only holds three doubles, so there is nothing to release.
Vec3d2::~Vec3d2()
{
    // intentionally empty
}
// Scalar (dot) product of this vector with a.
double Vec3d2::dot(Vec3d2 a) const
{
    const Vec3d2 &rhs = a;
    return this->x*rhs.x + this->y*rhs.y + this->z*rhs.z;
}
// Vector (cross) product this x a.
// Components: (y*a.z - z*a.y, z*a.x - x*a.z, x*a.y - y*a.x).
// The middle component previously had its sign flipped (x*a.z - z*a.x).
Vec3d2 Vec3d2::cross(Vec3d2 a) const
{
    Vec3d2 result( y*a.z - z*a.y, z*a.x - x*a.z, x*a.y - y*a.x);
    return result;
}
// Euclidean length of the vector.
double Vec3d2::norm() const
{
    const double squaredLength = (double) x*x + y*y + z*z;
    return sqrt(squaredLength);
}
// Scales the vector to unit length in place; a vector whose norm is not positive is left unchanged.
void Vec3d2::normalize()
{
    const double length = this->norm();
    if (!(length > 0))
    {
        return; // null norm: nothing to do
    }
    x = x/length;
    y = y/length;
    z = z/length;
}
// Component-wise sum of two vectors.
__host__ __device__ Vec3d2 operator+(Vec3d2 const& a, Vec3d2 const& b)
{
    Vec3d2 sum(a.x + b.x, a.y + b.y, a.z + b.z);
    return sum;
}
// Component-wise difference a - b.
__host__ __device__ Vec3d2 operator-(Vec3d2 const& a, Vec3d2 const& b)
{
    Vec3d2 difference(a.x - b.x, a.y - b.y, a.z - b.z);
    return difference;
}
// Vector times scalar: every component of a is multiplied by b.
__host__ __device__ Vec3d2 operator*(Vec3d2 const& a, double const& b)
{
    Vec3d2 scaled(b*a.x, b*a.y, b*a.z);
    return scaled;
}
// Scalar times vector: every component of a is multiplied by b.
__host__ __device__ Vec3d2 operator*(double const& b, Vec3d2 const& a)
{
    Vec3d2 scaled(b*a.x, b*a.y, b*a.z);
    return scaled;
}
// Vector divided by scalar: every component of a is divided by b.
__host__ __device__ Vec3d2 operator/(Vec3d2 const& a, double const& b)
{
    Vec3d2 quotient(a.x/b, a.y/b, a.z/b);
    return quotient;
}
// Scalar-on-the-left division. As written it returns a divided by b, i.e. the same
// result as `a / b`.
// NOTE(review): `b / a` yielding a/b is surprising - confirm whether b/a.x, b/a.y, b/a.z was intended.
__host__ __device__ Vec3d2 operator/(double const& b, Vec3d2 const& a)
{
    Vec3d2 quotient(a.x/b, a.y/b, a.z/b);
    return quotient;
}
// Exact component-wise equality (no tolerance).
__host__ __device__ bool operator==(Vec3d2 const& a, Vec3d2 const& b)
{
    return a.x == b.x && a.y == b.y && a.z == b.z;
}
// True as soon as any component differs (exact comparison, no tolerance).
__host__ __device__ bool operator!=(Vec3d2 const& a, Vec3d2 const& b)
{
    return a.x != b.x || a.y != b.y || a.z != b.z;
}

// Contact between two grains. In this reduced listing it has no data members and
// all of its member functions have empty bodies.
class Contact
{
public:
     __host__ __device__ Contact(void);
     //__host__ __device__ Contact(Contact const& ContactToCopy);
     __host__ __device__ ~Contact(void);
     // Intended to describe the contact of grain1 with grain2 for the given overlap.
     __host__ __device__ void setContact(Grain &grain1, Grain &grain2, double overlap_value);
};

// Base class of the simulated entities: owns a fixed-capacity, heap-allocated list of contacts.
class Block
{
public:
     // Construction is host only (it uses the host-side global id counter).
     __host__ Block(void);
     __host__ Block(Block const& BlockToCopy);
     __host__ __device__ ~Block(void);

     // Accessors for the contact array and its size information.
     __host__ __device__ Contact* getContactList() const;
     // Address of the contactList member itself, used when patching the pointer on the device.
     __host__ __device__ Contact** getContactListPtr();
     __host__ __device__ int getMaxNumberContact() const;
     __host__ __device__ int getNumberContact() const;

     // Replaces the contact array pointer without freeing the previous one.
     __host__ __device__ void setContactList(Contact *ptr);
     // Appends a contact if there is room, otherwise prints a message.
     __host__ __device__ void addContact(Contact contact_value);
     __host__ __device__ void clearContactList();// empty the contactList
     __host__ __device__ void deleteBlockData(); //clear the memory taken by the contactList
     __host__ __device__ Block& operator=(Block const& BlockToCopy);

protected:
    int Id; //unique Id number for each entity (NOTE(review): the original comment ended with "double mass;" - a mass member may have been swallowed into the comment; confirm)

    int totalNumberBlock; //same value for each block, cannot use static attribute because of cuda
    Contact *contactList;
    int numberContact, old_numberContact; //because there is no way to find it from the pointer contactList
    int maxNumberContact; //maximum number of contact per block, we have to choose this
};

// Spherical grain: a Block with a heap-allocated position, a radius and a grain number.
class Grain: public Block
{
public:
     // Construction is host only (it uses the host-side global grain counter).
     __host__ Grain(void);
     __host__ Grain(Grain const& grainToCopy);
     __host__ Grain(Vec3d2 position_value, double radius_value, double mass_value);
     __host__ __device__ ~Grain(void);

     // Position accessors: by value, by pointer, and the address of the pointer member itself
     // (the last one is used when patching the pointer on the device).
     __host__ __device__ Vec3d2 getPositionVec() const;
     __host__ __device__ Vec3d2* getPosition() const;
     __host__ __device__ Vec3d2** getPositionPtr();
     __host__ __device__ int getTotalNumberGrain() const;

     // The no-argument overload reads the host-side global counter, hence host only.
     __host__ void setTotalNumberGrain();
     __host__ __device__ void setTotalNumberGrain(int number);
     // Replaces the position pointer without freeing the previous one.
     __host__ __device__ void setPosition(Vec3d2 *ptr);
     // Overwrites the pointed-to position value.
     __host__ __device__ void setPositionVec(Vec3d2 position_value);
     // Frees the position storage.
     __host__ __device__ void deleteGrainData();

     // Tests this grain against the grains in grainList and records the contacts found.
     __host__ __device__ void findContactwithGrain(Grain *grainList);

     __host__ __device__ Grain& operator=(Grain const& grainToCopy);

     // Prints position, grain number and radius. Host only: iostreams are not available
     // in device code, so this must not carry __device__.
     // The trailing "mass is" label has no value after it because Grain stores no mass.
     __host__ friend ostream & operator <<(ostream &s,const Grain &grain) 
     { 
         s <<"position is" << endl;
         s << *grain.position << endl;
         s <<"grain number is" << endl;
         s << grain.number << endl;
         s <<"radius is" << endl;
         s << grain.radius << endl;
         s <<"mass is" << endl;
         return s; 
     } 
private:
    Vec3d2 *position;

    int totalNumberGrain;
    int number; //different from Id defined in class Block because a wall could have the same number as a grain
    double radius;
};

// Host-side driver: owns a copy of the grains, mirrors them on the device and runs the timing loop.
class Analysis
{
public:
    Analysis(void);
    // Deep-copies the grains of grainList; the count is read from its first element.
    Analysis(Grain *grainList);
    ~Analysis(void);
    Grain* getGrainList();
    // Allocates device storage and uploads the grains together with their pointed-to data.
    void copyToDevice();
    // Downloads the grains and their pointed-to data back into the host copies.
    void copyToHost();
    // Launches the compute kernel repeatedly and prints the elapsed time.
    void runAnalysis();
private: 
    //should contain grainList, wallList and their equivalent for the device
    //should contain an array of pointers for each attribute being a pointer in grain and wall and their equivalent in the device
    int totalNumberGrain, totalNumberWall;

    // Host array of grains and its device counterpart.
    Grain *grainList, *d_grainList;

    //for grain data
    // Host arrays with one pointer per grain; the d_ variants hold device addresses.
    Contact **grain_contactList, **d_grain_contactList;
    Vec3d2 **grain_position, **d_grain_position;
};

//class Contact functions
// Default constructor: Contact has no state to initialise.
Contact::Contact(void)
{
    // intentionally empty
}
// Destructor: Contact owns no resources.
Contact::~Contact(void)
{
    // intentionally empty
}
// Placeholder: called from grain1 for its contact with grain2; the arguments are currently ignored.
void Contact::setContact(Grain &grain1, Grain &grain2, double overlap_value)
{
    // intentionally empty
}

//class Block functions
// Default constructor: takes the next global id and allocates an empty contact list
// with room for maxNumberContact entries.
Block::Block(void)
{
    Id = global_totalNumberBlock;
    numberContact = 0;
    old_numberContact = 0;
    // The destructor reads and decrements this member, so it must not stay uninitialised.
    // -1 means "not set yet", the same convention Grain uses for totalNumberGrain.
    totalNumberBlock = -1;
    //contact list settings
    maxNumberContact = 30;
    contactList = new Contact[maxNumberContact];
    //increment of block number
    global_totalNumberBlock = global_totalNumberBlock + 1;
}
// Destructor: releases the contact array and decrements this object's own block counter.
Block::~Block(void)
{
    delete[] contactList;

    // Blocks are not expected to be destroyed during a run; the warning that used to be
    // printed here is disabled.
    --totalNumberBlock;
}
// Copy constructor: duplicates the bookkeeping fields and deep-copies the contacts in use.
// The new array has the same capacity as the source; only the first numberContact
// entries are copied.
Block::Block(Block const& BlockToCopy)
{
    Id = BlockToCopy.Id;
    // Previously left indeterminate even though the destructor reads it.
    totalNumberBlock = BlockToCopy.totalNumberBlock;

    numberContact = BlockToCopy.numberContact;
    old_numberContact = BlockToCopy.old_numberContact;
    maxNumberContact = BlockToCopy.maxNumberContact;

    contactList = new Contact[maxNumberContact];

    for(int i =0; i <numberContact; i++)
    {
        contactList[i] = BlockToCopy.contactList[i];
    }
}

// Returns the pointer to the contact array (the array itself is not copied).
Contact* Block::getContactList() const
{
    Contact *list = contactList;
    return list;
}
// Returns the address of the contactList member itself, so a caller can overwrite the pointer.
Contact** Block::getContactListPtr()
{
    return &(this->contactList);
}
// Capacity of the contact array.
int Block::getMaxNumberContact() const
{
    return this->maxNumberContact;
}
// Number of contacts currently stored.
int Block::getNumberContact() const
{
    return this->numberContact;
}
// Points contactList at ptr. The previous pointer is deliberately not deleted: this is
// called after the CUDA copy back, when the old value no longer refers to valid host
// memory and deleting it would be an error.
void Block::setContactList(Contact *ptr)
{
    this->contactList = ptr;
}

// Appends contact_value to the contact list. When the list is already full the contact
// is dropped and a message is printed (no error is raised yet).
void Block::addContact(Contact contact_value)
{
    if(numberContact >= maxNumberContact)
    {
        // TODO: find a way to signal an error, the list is too small for all the contacts
        printf("TOO MANY CONTACTS ON ONE GRAIN");
        return;
    }

    contactList[numberContact] = contact_value;
    numberContact = numberContact + 1;
}
// Empties the contact list by resetting the counter; the array is kept and not reallocated.
// The counter is only written when it is not already zero.
void Block::clearContactList()
{
    if(numberContact <= 0)
    {
        return;
    }
    numberContact = 0;
}
// Frees the memory held by the contact list.
// The pointer is reset afterwards so that a later delete[] (for example in the destructor)
// is a harmless no-op instead of a double free.
void Block::deleteBlockData()
{
    delete[] contactList;
    contactList = NULL;
}

// Copy assignment: copies the id and the contact bookkeeping, then replaces the contact
// array with a fresh deep copy of the contacts in use. totalNumberBlock is not copied.
__host__ __device__ Block& Block::operator=(Block const& BlockToCopy)
{
    // Self-assignment (a = a): nothing to do.
    if(this == &BlockToCopy)
    {
        return *this;
    }

    Id = BlockToCopy.Id;

    numberContact = BlockToCopy.numberContact;
    old_numberContact = BlockToCopy.old_numberContact;
    maxNumberContact = BlockToCopy.maxNumberContact;

    // Drop the old storage and allocate one matching the source capacity.
    delete[] contactList;
    contactList = new Contact[maxNumberContact];

    int k = 0;
    while(k < numberContact)
    {
        contactList[k] = BlockToCopy.contactList[k];
        ++k;
    }

    return *this;
}

//class Grain functions
// Default constructor: takes the next global grain number and allocates a zero position.
Grain::Grain(void)
{
    number = global_totalNumberGrain;
    global_totalNumberGrain = global_totalNumberGrain + 1;
    totalNumberGrain = -1;//safety
    // Previously left uninitialised; give it a defined value until a real radius is assigned.
    radius = 0;
    //initialize Vec3d2
    position = new Vec3d2;
}
// Copy constructor: deep copy of a grain.
// The previous stub only printed a warning and left `position` uninitialised, so the
// destructor of any copied grain deleted a garbage pointer. The Block part is copied by
// Block's copy constructor; the copy keeps the source's grain number (no new number is
// taken from the global counter) and gets its own position storage.
Grain::Grain(Grain const& grainToCopy)
    : Block(grainToCopy)
{
    totalNumberGrain = grainToCopy.totalNumberGrain;
    number = grainToCopy.number;
    radius = grainToCopy.radius;

    position = new Vec3d2;
    *position = *grainToCopy.position;
}
// Builds a grain at position_value with the given radius and takes the next global grain number.
// mass_value is accepted but not stored.
Grain::Grain(Vec3d2 position_value, double radius_value,double mass_value)
{
    number = global_totalNumberGrain++;
    totalNumberGrain = -1; // safety: not set yet
    radius = radius_value;

    // Allocate the position storage, then copy the requested value into it.
    position = new Vec3d2;
    *position = position_value;
}
// Destructor: decrements this object's own grain counter and frees the position storage.
Grain::~Grain(void)
{
    // Grains are not expected to be destroyed during a run; the warning that used to be
    // printed here is disabled.
    --totalNumberGrain;

    delete position;
}
// Returns a copy of the grain's position.
Vec3d2 Grain::getPositionVec() const
{
    const Vec3d2 &current = *position;
    return current;
}
// Returns the pointer to the position storage (no copy).
Vec3d2* Grain::getPosition() const
{
    return this->position;
}
// Returns the address of the position member itself, so a caller can overwrite the pointer.
Vec3d2** Grain::getPositionPtr()
{
    return &(this->position);
}
// Returns the grain count stored in this grain (-1 until one of the setters has been called).
int Grain::getTotalNumberGrain() const
{
    return this->totalNumberGrain;
}
// Stores the current value of the global grain counter in this grain.
void Grain::setTotalNumberGrain()
{
    this->totalNumberGrain = global_totalNumberGrain;
}
// Stores an explicit grain count. The parameter `number` shadows the member of the same
// name, which is left untouched.
void Grain::setTotalNumberGrain(int number)
{
    this->totalNumberGrain = number;
}
// Points the grain at a different position storage; the previous pointer is not freed here.
void Grain::setPosition(Vec3d2 *ptr)
{
    this->position = ptr;
}
// Overwrites the stored position value; the storage itself is reused.
void Grain::setPositionVec(Vec3d2 position_value)
{
    Vec3d2 &stored = *position;
    stored = position_value;
}
// Frees the position storage.
// The pointer is reset afterwards so that a later delete (for example in the destructor)
// is a harmless no-op instead of a double free.
void Grain::deleteGrainData()
{
    delete position;
    position = NULL;
}
// Tests this grain against the first n grains of grainList and records a contact for every
// other grain whose centre is closer than the sum of the two radii.
// grainList must hold at least n grains. Only this grain's own contact list is written.
void Grain::findContactwithGrain(Grain *grainList)
{
    for(int m = 0; m < n; m++)
    {
        // Centre-to-centre distance, computed once. The original rebuilt the same difference
        // vector and took its norm (a double-precision sqrt) a second time inside the branch.
        const double length = (*position - (*grainList[m].position)).norm();
        const double contactDistance = radius + grainList[m].radius;

        if( length < contactDistance)
        {
            if( number != grainList[m].number) // a grain is never in contact with itself
            {
                const double overlap = contactDistance - length;

                //define the contact
                Contact grainContact;
                grainContact.setContact(*this, grainList[m], overlap);
                // This store to the contact list is the kernel's only visible side effect;
                // without it the compiler is free to remove the whole loop.
                addContact(grainContact);
            }
        }
    }
}
// Copy assignment: copies the Block part, the radius and the position value.
// The grain number and totalNumberGrain of the target are kept as they are.
__host__ __device__ Grain& Grain::operator=(Grain const& grainToCopy)
{
    if(this == &grainToCopy)
    {
        return *this; // self-assignment
    }

    // Let Block's operator= copy the base-class attributes (id, contacts).
    Block::operator=(grainToCopy);

    radius = grainToCopy.radius;
    // Copy the value into the existing storage; the pointer itself is not shared.
    *position = *grainToCopy.position;

    return *this;
}

//class Analysis functions
// Default constructor: an empty analysis that owns nothing.
// Every member is given a defined value; previously the destructor ran delete[] on an
// uninitialised grainList pointer.
Analysis::Analysis(void)
{
    totalNumberGrain = 0;
    totalNumberWall = 0;

    grainList = NULL;
    d_grainList = NULL;

    grain_contactList = NULL;
    d_grain_contactList = NULL;
    grain_position = NULL;
    d_grain_position = NULL;
}
// Builds the analysis from an existing grain array.
// The number of grains is read from grainList_value[0], so the array must be non-empty and
// its grains must have had their total count set. Each grain is deep-copied via operator=.
Analysis::Analysis(Grain *grainList_value)
{
    totalNumberGrain = grainList_value[0].getTotalNumberGrain();
    totalNumberWall = 0; // not set anywhere in the visible code; give it a defined value

    // Device-side and bookkeeping pointers are filled in later by copyToDevice/copyToHost;
    // start them from a defined state instead of leaving them indeterminate.
    d_grainList = NULL;
    grain_contactList = NULL;
    d_grain_contactList = NULL;
    grain_position = NULL;
    d_grain_position = NULL;

    grainList = new Grain[totalNumberGrain];

    //copy grains
    for(int i = 0; i < totalNumberGrain; i++)
    {
        grainList[i] = grainList_value[i];
        // operator= does not copy the total count, so it is transferred explicitly.
        grainList[i].setTotalNumberGrain(grainList_value[i].getTotalNumberGrain());
    }
}
// Destructor: releases the host grain array only.
// TODO: the device allocations and the per-grain pointer arrays are not released here yet.
Analysis::~Analysis(void)
{
    delete[] this->grainList;
}
// Returns the host-side grain array owned by this analysis.
Grain* Analysis::getGrainList()
{
    return this->grainList;
}
// Stops the program with a readable message when a CUDA runtime call made by
// Analysis::copyToDevice fails (the return codes used to be ignored).
static void checkDeviceUpload(cudaError_t status, const char *what)
{
    if(status != cudaSuccess)
    {
        fprintf(stderr, "copyToDevice: %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Mirrors the host grains on the device.
// 1. The Grain objects are copied byte-wise, so the device copies first hold host pointers.
// 2. Device storage is allocated for each grain's contact list and position.
// 3. The pointer members of the device grains are overwritten with those device addresses,
//    and the pointed-to values are uploaded.
void Analysis::copyToDevice()
{
    const size_t grainBytes = totalNumberGrain*sizeof(Grain);

    //declare device grainList and copy the values
    checkDeviceUpload(cudaMalloc(&d_grainList, grainBytes), "cudaMalloc(grain array)");
    checkDeviceUpload(cudaMemcpy(d_grainList, grainList, grainBytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(grain array)");

    // Host-side arrays remembering the device address of each grain's data.
    d_grain_contactList = new Contact*[totalNumberGrain];
    d_grain_position = new Vec3d2*[totalNumberGrain];

    for(int i = 0; i < totalNumberGrain; i++)
    {
        const size_t contactBytes = grainList[i].getMaxNumberContact()*sizeof(Contact);

        checkDeviceUpload(cudaMalloc(&d_grain_contactList[i], contactBytes), "cudaMalloc(contact list)");
        checkDeviceUpload(cudaMalloc(&d_grain_position[i], sizeof(Vec3d2)), "cudaMalloc(position)");
    }

    //copy pointers and values for grains
    for(int i = 0; i < totalNumberGrain; i++)
    {
        const size_t contactBytes = grainList[i].getMaxNumberContact()*sizeof(Contact);

        //pointers
        // d_grainList[i] names device memory; the getters only take the address of a member,
        // and that address is used as the destination of the copy.
        checkDeviceUpload(cudaMemcpy(d_grainList[i].getContactListPtr(), &d_grain_contactList[i],
                                     sizeof(Contact*), cudaMemcpyHostToDevice),
                          "cudaMemcpy(contact list pointer)");
        checkDeviceUpload(cudaMemcpy(d_grainList[i].getPositionPtr(), &d_grain_position[i],
                                     sizeof(Vec3d2*), cudaMemcpyHostToDevice),
                          "cudaMemcpy(position pointer)");
        //values
        checkDeviceUpload(cudaMemcpy(d_grain_contactList[i], grainList[i].getContactList(),
                                     contactBytes, cudaMemcpyHostToDevice),
                          "cudaMemcpy(contact list)");
        checkDeviceUpload(cudaMemcpy(d_grain_position[i], grainList[i].getPosition(),
                                     sizeof(Vec3d2), cudaMemcpyHostToDevice),
                          "cudaMemcpy(position)");
    }
}
// Stops the program with a readable message when a CUDA runtime call made by
// Analysis::copyToHost fails (the return codes used to be ignored).
static void checkDeviceDownload(cudaError_t status, const char *what)
{
    if(status != cudaSuccess)
    {
        fprintf(stderr, "copyToHost: %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Brings the grains back from the device. Must be called after copyToDevice.
// The byte-wise copy of the Grain objects overwrites the host pointer members with device
// addresses, so the host storage is freed first and fresh host storage is attached afterwards.
void Analysis::copyToHost()
{
    //delete the pointed-to host data, otherwise it would leak once the pointers are overwritten
    for(int i = 0; i < totalNumberGrain; i++)
    {
        grainList[i].deleteBlockData();
        grainList[i].deleteGrainData();
    }

    //copy non pointer value
    checkDeviceDownload(cudaMemcpy(grainList, d_grainList, totalNumberGrain*sizeof(Grain), cudaMemcpyDeviceToHost),
                        "cudaMemcpy(grain array)");

    //copy pointer values for grains
    grain_contactList = new Contact*[totalNumberGrain];
    grain_position = new Vec3d2*[totalNumberGrain];

    for(int i = 0; i < totalNumberGrain; i++)
    {
        const size_t contactBytes = grainList[i].getMaxNumberContact()*sizeof(Contact);

        grain_contactList[i] = new Contact[grainList[i].getMaxNumberContact()];
        grain_position[i] = new Vec3d2;

        // Replace the device addresses now stored in the grain with the new host storage.
        grainList[i].setContactList(grain_contactList[i]);
        grainList[i].setPosition(grain_position[i]);

        checkDeviceDownload(cudaMemcpy(grain_contactList[i], d_grain_contactList[i], contactBytes, cudaMemcpyDeviceToHost),
                            "cudaMemcpy(contact list)");
        checkDeviceDownload(cudaMemcpy(grain_position[i], d_grain_position[i], sizeof(Vec3d2), cudaMemcpyDeviceToHost),
                            "cudaMemcpy(position)");
    }
}

// Kernel: one thread per grain. Expects a 1D launch covering at least n threads and g
// pointing to n grains in device memory. Thread idx searches the contacts of g[idx].
__global__ void compute( Grain *g)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads of the last block have no grain to process.
    if( idx >= n )
    {
        return;
    }

    g[idx].findContactwithGrain(g);
}

// Runs three timing rounds of 10000 kernel launches each and prints the seconds per round.
// Kernel launches are asynchronous, so the device is synchronised before the clock is
// stopped; otherwise mostly the launch overhead would be measured. Launch and execution
// errors are reported instead of being silently ignored.
void Analysis::runAnalysis()
{
    for(int i = 0; i < 3; i ++)
    {
        clock_t begin = clock();
        for(int j = 0; j < 10000; j++)
        {
            compute<<<(n + 511)/512, 512>>>(d_grainList);   
        }
        // Launch-configuration errors show up here, execution errors at the synchronisation.
        cudaError_t status = cudaGetLastError();
        if(status == cudaSuccess)
        {
            status = cudaDeviceSynchronize();
        }
        clock_t end = clock();

        if(status != cudaSuccess)
        {
            cerr << "compute kernel failed: " << cudaGetErrorString(status) << endl;
        }

        cout << (double)(end-begin)/CLOCKS_PER_SEC << endl;

        system("PAUSE");
    }
}

// Test driver: builds n identical grains spaced 3 units apart along x (radius 1, so none
// of them touch), uploads them to the device, runs the timing loop and prints the total time.
int main(void)
{
    //grain
    Vec3d2 position1; position1[0] = 0;position1[1] = 0;position1[2] = 0;
    double radius1 = 1;

    ////cuda
    cout << "PRESS ENTER TO START" << endl;
    system("PAUSE");
    clock_t begin = clock();

    Grain *g = new Grain[n]; // the unused device pointer d_g was removed
    for(int i = 0; i<n; i++)
    {
        g[i].setTotalNumberGrain();
    }

    Grain g1(position1, radius1, 0.1);

    for(int i = 0; i <n; i++)
    {
        g[i] = g1;
        g[i].setPositionVec(Vec3d2(3*i+1.5, 1.5, 0));
    }

    // Analysis makes its own deep copy of the grains.
    Analysis a(g);
    a.copyToDevice();
    a.runAnalysis();

    clock_t end = clock();

    cout << (double)(end-begin)/CLOCKS_PER_SEC << endl;

    // g was previously leaked; Analysis owns separate copies, so freeing it here is safe.
    delete[] g;

  return 0;
}

Answer 1

我需要更多代码才能验证，但最可能的解释是：当您把那一行注释掉时，内核实际上不再向全局内存写入任何数据。当内核不向全局内存写入任何东西时，nvcc 几乎会把所有代码都优化掉，以至于实际上什么都没有运行。

换成 printf 语句时也是同样的道理。打印语句被视为输出，因此 nvcc 无法把它所依赖的代码优化掉。

例如：

// Illustration: sums the array into a thread-local variable and never stores the result,
// so the compiler is free to remove all of the work.
// Fixes: the return type `void` was missing and the accumulator was read uninitialised.
__global__ void empty_kernel(int* huge_array, int num_elements){
    int local_memory = 0;
    for(int i=0; i<num_elements; i++){
        local_memory+=huge_array[i];      
    }
}

会比下面这个内核运行得更快：

// Illustration: does far less arithmetic than the previous kernel, but stores its result
// to global memory, so none of the work can be optimised away.
// Expects a 1D launch; smaller_array needs one element per launched thread, and small_array
// must be valid for every index tid*i with i in 0..4.
// Fixes: missing `void`, wrong capitalisation of the built-ins threadIdx/blockIdx/blockDim,
// the undeclared name huge_array (the parameter is small_array) and an uninitialised accumulator.
__global__ void empty_kernel(int* small_array, int num_elements, int* smaller_array){
    int tid = threadIdx.x+blockIdx.x*blockDim.x;
    int local_memory = 0;
    for(int i=0; i<5; i++){
        local_memory+=small_array[tid*i];      
    }
    smaller_array[tid]=local_memory;
}

最重要的是，你的第一个内核并不快，它只是没有被运行。

Answer 2

我认为问题出在 if 语句本身，而不是它有条件执行的那条语句。条件分支在 GPU 体系结构上可能相当昂贵（尽管在较新的体系结构中似乎有所改善），哪怕只有一条分支语句，也肯定会减慢你的代码。

如果删除if子句中的语句，编译器会发现没有剩下代码，因此也可以优化if本身。所以这就是你删除这行代码时看到加速的原因。

Cuda：未执行的代码行减慢了程序

2 个答案: