我正在使用Cuda和C ++来做一些并行计算。最近,我发现了一些我无法理解的东西,在寻找它时我没有找到有关它的信息。在我的代码中,一行很少被执行(但需要),即使它根本没有被执行也会减慢程序的速度。以下是一些使其更清晰的代码:
我创建的课程:
class Foo
{
void myFunction(Foo *listFoo);
//some other functions that I need
...
int myAttribute;
//some other attributes that I need
...
}
myFunction的定义:
void Foo::myFunction(Foo *listFoo)
{
//do some computations on the listFoo
if( condition seldom verified )
{ myAttribute = myAttribute + 1; }
}
全球职能:
__global__ void compute(Foo *listFoo, int numberOfFoo)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if( i < numberOfFoo)
{ listFoo[i].myFunction(listFoo); }
}
主持人代码:
compute<<<(numberOfFoo + 511)/512, 512>>> (listFoo, numberOfFoo)
减慢所有内容的行是myAttribute = myAttribute + 1.即使执行了0次,与将行放入注释时相比,代码也非常慢。我试图用一个简单的printf替换这一行。结果是一样的,该行永远不会被执行,但它会减慢一切。
如果您对原因有任何建议,并最终如何解决此问题,我们将非常感激。我的编程水平不是很高级,所以请使用相对简单的解释。
非常感谢
首次编辑:很少有人请求代码,所以现在就是这样!我将它减少到700行,我知道它仍然很长但是如果我继续删除它的某些部分,那就不会有多少工作。它编译没有问题我。您只需按Enter键,等待几秒钟,所需的时间将显示在命令窗口中。
问题发生在函数findContactwithGrain()中。 addContact(grainContact)行正在减慢一切。在我的计算机上,如果此行处于活动状态,则一次计算大约需要3.5秒。如果我把它放在评论中,则需要0.07秒。对于一条从未执行的行来说,这是一个巨大的差异。
希望这有助于理解问题
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <fstream> // to read and write files
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <string>
#include <sstream>
#define n 200
using namespace std;
int global_totalNumberBlock = 0;
int global_totalNumberGrain = 0;
//tell the compiler that those classes exist
class Vec3d2;
class Block;
class Grain;
class Contact;
class Analysis;
class Vec3d2
{
public:
__host__ __device__ Vec3d2(void);
__host__ __device__ Vec3d2(double x_value, double y_value, double z_value);
__host__ __device__ ~Vec3d2(void);
__host__ __device__ double dot(Vec3d2 a) const;
__host__ __device__ Vec3d2 cross(Vec3d2 a) const;
__host__ __device__ double norm() const;
__host__ __device__ void normalize();
// to be able to use cout easily
__host__ __device__ friend ostream & operator <<(ostream &s,const Vec3d2 &vec)
{
s << vec.x << endl;
s << vec.y << endl;
s << vec.z << endl;
return s;
}
//to be able to use brackets
__host__ __device__ double operator [](int i) const
{
if( i == 0)
{
return x;
}
else if( i == 1)
{
return y;
}
else if( i == 2)
{
return z;
}
else
{
cout << "ERROR IN USING VEC3D2" << endl;
system("PAUSE");
}
}
__host__ __device__ double & operator [](int i)
{
if( i == 0)
{
return x;
}
else if( i == 1)
{
return y;
}
else if( i == 2)
{
return z;
}
else
{
cout << "ERROR IN USING VEC3D2" << endl;
system("PAUSE");
}
}
//attributes
double x, y, z;
};
//Class Vec3d2 functions and operators
Vec3d2::Vec3d2()
{
x = 0;
y = 0;
z = 0;
}
Vec3d2::Vec3d2(double x_value, double y_value, double z_value)
{
x = x_value;
y = y_value;
z = z_value;
}
Vec3d2::~Vec3d2()
{
}
double Vec3d2::dot(Vec3d2 a) const
{
return x*a.x + y*a.y + z*a.z;
}
Vec3d2 Vec3d2::cross(Vec3d2 a) const
{
Vec3d2 result( y*a.z - z*a.y, x*a.z - z*a.x, x*a.y - y*a.x);
return result;
}
double Vec3d2::norm() const
{
return sqrt((double) x*x + y*y + z*z);
}
void Vec3d2::normalize()
{
double norm = this->norm();
if (norm > 0)
{
x = x/norm;
y = y/norm;
z = z/norm;
}
else //the vector has a null norm so nothing to do
{
}
}
__host__ __device__ Vec3d2 operator+(Vec3d2 const& a, Vec3d2 const& b)
{
return Vec3d2(a.x + b.x, a.y + b.y, a.z + b.z);
}
__host__ __device__ Vec3d2 operator-(Vec3d2 const& a, Vec3d2 const& b)
{
return Vec3d2(a.x - b.x, a.y - b.y, a.z - b.z);
}
__host__ __device__ Vec3d2 operator*(Vec3d2 const& a, double const& b)
{
return Vec3d2(b*a.x, b*a.y, b*a.z);
}
__host__ __device__ Vec3d2 operator*(double const& b, Vec3d2 const& a)
{
return Vec3d2(b*a.x, b*a.y, b*a.z);
}
__host__ __device__ Vec3d2 operator/(Vec3d2 const& a, double const& b)
{
return Vec3d2(a.x/b, a.y/b, a.z/b);
}
__host__ __device__ Vec3d2 operator/(double const& b, Vec3d2 const& a)
{
return Vec3d2(a.x/b, a.y/b, a.z/b);
}
__host__ __device__ bool operator==(Vec3d2 const& a, Vec3d2 const& b)
{
if(a.x == b.x && a.y == b.y && a.z == b.z)
{
return true;
}
else
{
return false;
}
}
__host__ __device__ bool operator!=(Vec3d2 const& a, Vec3d2 const& b)
{
if( a.x != b.x || a.y != b.y || a.z != b.z)
{
return true;
}
else
{
return false;
}
}
class Contact
{
public:
__host__ __device__ Contact(void);
//__host__ __device__ Contact(Contact const& ContactToCopy);
__host__ __device__ ~Contact(void);
__host__ __device__ void setContact(Grain &grain1, Grain &grain2, double overlap_value);
};
class Block
{
public:
__host__ Block(void);
__host__ Block(Block const& BlockToCopy);
__host__ __device__ ~Block(void);
__host__ __device__ Contact* getContactList() const;
__host__ __device__ Contact** getContactListPtr();
__host__ __device__ int getMaxNumberContact() const;
__host__ __device__ int getNumberContact() const;
__host__ __device__ void setContactList(Contact *ptr);
__host__ __device__ void addContact(Contact contact_value);
__host__ __device__ void clearContactList();// empty the contactList
__host__ __device__ void deleteBlockData(); //clear the memory taken by the contactList
__host__ __device__ Block& operator=(Block const& BlockToCopy);
protected:
int Id; //unique Id number for each entity double mass;
int totalNumberBlock; //same value for each block, cannot use static attribute because of cuda
Contact *contactList;
int numberContact, old_numberContact; //because there is no way to find it from the pointer contactList
int maxNumberContact; //maximum number of contact per block, we have to choose this
};
class Grain: public Block
{
public:
__host__ Grain(void);
__host__ Grain(Grain const& grainToCopy);
__host__ Grain(Vec3d2 position_value, double radius_value, double mass_value);
__host__ __device__ ~Grain(void);
__host__ __device__ Vec3d2 getPositionVec() const;
__host__ __device__ Vec3d2* getPosition() const;
__host__ __device__ Vec3d2** getPositionPtr();
__host__ __device__ int getTotalNumberGrain() const;
__host__ void setTotalNumberGrain();
__host__ __device__ void setTotalNumberGrain(int number);
__host__ __device__ void setPosition(Vec3d2 *ptr);
__host__ __device__ void setPositionVec(Vec3d2 position_value);
__host__ __device__ void deleteGrainData();
__host__ __device__ void findContactwithGrain(Grain *grainList);
__host__ __device__ Grain& operator=(Grain const& grainToCopy);
__host__ __device__ friend ostream & operator <<(ostream &s,const Grain &grain)
{
s <<"position is" << endl;
s << *grain.position << endl;
s <<"grain number is" << endl;
s << grain.number << endl;
s <<"radius is" << endl;
s << grain.radius << endl;
s <<"mass is" << endl;
return s;
}
private:
Vec3d2 *position;
int totalNumberGrain;
int number; //different from Id defined in class Block because a wall could have the same number as a grain
double radius;
};
class Analysis
{
public:
Analysis(void);
Analysis(Grain *grainList);
~Analysis(void);
Grain* getGrainList();
void copyToDevice();
void copyToHost();
void runAnalysis();
private:
//should contain grainList, wallList and their equivalent for the device
//should contain an array of pointers for each attribute being a pointer in grain and wall and their equivalent in the device
int totalNumberGrain, totalNumberWall;
Grain *grainList, *d_grainList;
//for grain data
Contact **grain_contactList, **d_grain_contactList;
Vec3d2 **grain_position, **d_grain_position;
};
//class Contact functions
Contact::Contact(void)
{
}
Contact::~Contact(void)
{
}
void Contact::setContact(Grain &grain1, Grain &grain2, double overlap_value)//we are in grain1 and contact with grain2
{
}
//class Block functions
Block::Block(void)
{
Id = global_totalNumberBlock;
numberContact = 0;
old_numberContact = 0;
//contact list settings
maxNumberContact = 30;
contactList = new Contact[maxNumberContact];
//increment of block number
global_totalNumberBlock = global_totalNumberBlock + 1;
}
Block::~Block(void)
{
delete[] contactList;
//cout << "CAREFUL, YOU ARE DESTROYING A BLOCK" << endl;//because we should never erase a block
//system("PAUSE");
totalNumberBlock = totalNumberBlock - 1;
}
Block::Block(Block const& BlockToCopy)
{
Id = BlockToCopy.Id;
numberContact = BlockToCopy.numberContact;
old_numberContact = BlockToCopy.old_numberContact;
maxNumberContact = BlockToCopy.maxNumberContact;
contactList = new Contact[maxNumberContact];
for(int i =0; i <numberContact; i++)
{
contactList[i] = BlockToCopy.contactList[i];
}
}
Contact* Block::getContactList() const
{
return contactList;
}
Contact** Block::getContactListPtr()
{
return &contactList;
}
int Block::getMaxNumberContact() const
{
return maxNumberContact;
}
int Block::getNumberContact() const
{
return numberContact;
}
void Block::setContactList(Contact *ptr)
{
//no "delete contactList" here because this is executed after cuda. The contactList is pointing to nothing and deleteing it will cause an error
contactList = ptr;
}
void Block::addContact(Contact contact_value)
{
if(numberContact < maxNumberContact)
{
contactList[numberContact] = contact_value;
numberContact = numberContact + 1;
}
else //find a way to throw an error because the list is too small for all the contacts
{
printf("TOO MANY CONTACTS ON ONE GRAIN");
}
}
void Block::clearContactList()
{
//delete[] contactList;
//contactList = new Contact[maxNumberContact];
if(numberContact > 0)
{
numberContact = 0;
}
}
void Block::deleteBlockData()
{
delete[] contactList;
}
__host__ __device__ Block& Block::operator=(Block const& BlockToCopy)
{
if(this != &BlockToCopy) //to check we are not doing a = a
{
Id = BlockToCopy.Id;
numberContact = BlockToCopy.numberContact;
old_numberContact = BlockToCopy.old_numberContact;
maxNumberContact = BlockToCopy.maxNumberContact;
delete[] contactList;
contactList = new Contact[maxNumberContact];
for(int i =0; i <numberContact; i++)
{
contactList[i] = BlockToCopy.contactList[i];
}
}
return *this;
}
//class Grain functions
Grain::Grain(void)
{
number = global_totalNumberGrain;
global_totalNumberGrain = global_totalNumberGrain + 1;
totalNumberGrain = -1;//safety
//initialize Vec3d2
position = new Vec3d2;
}
Grain::Grain(Grain const& grainToCopy)
{
cout <<"COPY CONSTRUCTOR OF GRAIN IS NOT DONE YET"<<endl;
system("PAUSE");
//totalNumberGrain = grainToCopy.totalNumberGrain;
//radius = grainToCopy.radius;
//diameter = grainToCopy.diameter;
//volume = grainToCopy.volume;
//inertia = grainToCopy.inertia;
//position = new Vec3d2;
//old_position = new Vec3d2;
//old_velocity = new Vec3d2;
//old_acceleration = new Vec3d2;
//old_angularVelocity = new Vec3d2;
//old_angularAcceleration = new Vec3d2;
//gravityForce = new Vec3d2;
//*position = *grainToCopy.position;
//*old_position = *grainToCopy.old_position;
//*old_velocity = *grainToCopy.old_velocity;
//*old_acceleration = *grainToCopy.old_acceleration;
//*old_angularVelocity = *grainToCopy.old_angularVelocity;
//*old_angularAcceleration = *grainToCopy.old_angularAcceleration;
//*gravityForce = *grainToCopy.gravityForce;
}
Grain::Grain(Vec3d2 position_value, double radius_value,double mass_value)//, number(totalNumberGrain)
{
number = global_totalNumberGrain;
global_totalNumberGrain = global_totalNumberGrain + 1;
totalNumberGrain = -1;//safety
radius = radius_value;
//initialize all the Vec3d2 parameters
position = new Vec3d2;
*position = position_value;
}
Grain::~Grain(void)
{
//cout << "CAREFUL, YOU ARE DESTROYING A GRAIN" << endl;//because we should never erase a block
//system("PAUSE");
totalNumberGrain = totalNumberGrain - 1;
delete position;
}
Vec3d2 Grain::getPositionVec() const
{
return *position;
}
Vec3d2* Grain::getPosition() const
{
return position;
}
Vec3d2** Grain::getPositionPtr()
{
return &position;
}
int Grain::getTotalNumberGrain() const
{
return totalNumberGrain;
}
void Grain::setTotalNumberGrain()
{
totalNumberGrain = global_totalNumberGrain;
}
void Grain::setTotalNumberGrain(int number)
{
totalNumberGrain = number;
}
void Grain::setPosition(Vec3d2 *ptr)
{
position = ptr;
}
void Grain::setPositionVec(Vec3d2 position_value)
{
*position = position_value;
}
void Grain::deleteGrainData()
{
delete position;
}
void Grain::findContactwithGrain(Grain *grainList)
{
for(int m = 0; m < n; m++)
{
double length;
length = (*position - (*grainList[m].position)).norm();
if( length < radius + grainList[m].radius)
{
if( number != grainList[m].number) //faster than number != sortedGrainList[m]
{
Vec3d2 relativePosition = *position - (*grainList[m].position) ;
double overlap = radius + grainList[m].radius - relativePosition.norm();
//define the contact
Contact grainContact;
grainContact.setContact(*this, grainList[m], overlap);
addContact(grainContact); //IF YOU PUT THIS LINE IN COMMENT, EVERYTHING GOES A LOT FASTER
}
}
}
}
__host__ __device__ Grain& Grain::operator=(Grain const& grainToCopy)
{
if(this != &grainToCopy)
{
Block::operator=(grainToCopy); //this lines call the operator = defined for Block. So it copies the block attributes of the first grain into the second grain
//totalNumberGrain = grainToCopy.totalNumberGrain;
radius = grainToCopy.radius;
*position = *grainToCopy.position;
}
return *this;
}
//class Analysis functions
Analysis::Analysis(void)
{
}
Analysis::Analysis(Grain *grainList_value)
{
totalNumberGrain = grainList_value[0].getTotalNumberGrain();
grainList = new Grain[totalNumberGrain];
//copy grains
for(int i = 0; i < totalNumberGrain; i++)
{
grainList[i] = grainList_value[i];
grainList[i].setTotalNumberGrain(grainList_value[i].getTotalNumberGrain());
}
}
Analysis::~Analysis(void)
{
delete[] grainList;
//a lot more delete should be made here
}
Grain* Analysis::getGrainList()
{
return grainList;
}
void Analysis::copyToDevice()
{
//declare device grainList and wallList and copy the values
cudaMalloc(&d_grainList, totalNumberGrain*sizeof(Grain));
cudaMemcpy(d_grainList, grainList, totalNumberGrain*sizeof(Grain), cudaMemcpyHostToDevice);
////declare device list of pointer to pass pointer values of grain
d_grain_contactList = new Contact*[totalNumberGrain];
d_grain_position = new Vec3d2*[totalNumberGrain];
for(int i = 0; i < totalNumberGrain; i++)
{
cudaMalloc(&d_grain_contactList[i], grainList[i].getMaxNumberContact()*sizeof(Contact));
cudaMalloc(&d_grain_position[i], sizeof(Vec3d2));
}
//copy pointers and values for grains
for(int i = 0; i < totalNumberGrain; i++)
{
//pointers
cudaMemcpy(d_grainList[i].getContactListPtr(), &d_grain_contactList[i], sizeof(Contact*), cudaMemcpyHostToDevice);
cudaMemcpy(d_grainList[i].getPositionPtr(), &d_grain_position[i], sizeof(Vec3d2*), cudaMemcpyHostToDevice);
//values
cudaMemcpy(d_grain_contactList[i], grainList[i].getContactList(), grainList[i].getMaxNumberContact()*sizeof(Contact), cudaMemcpyHostToDevice);
cudaMemcpy(d_grain_position[i], grainList[i].getPosition(), sizeof(Vec3d2), cudaMemcpyHostToDevice);
}
}
void Analysis::copyToHost()
{
//delete the pointer value or it will create a memory leak
for(int i = 0; i < totalNumberGrain; i++)
{
grainList[i].deleteBlockData();
grainList[i].deleteGrainData();
}
//copy non pointer value
cudaMemcpy(grainList, d_grainList, totalNumberGrain*sizeof(Grain),cudaMemcpyDeviceToHost);
//copy pointer values for grains
grain_contactList = new Contact*[totalNumberGrain];
grain_position = new Vec3d2*[totalNumberGrain];
for(int i = 0; i < totalNumberGrain; i++)
{
grain_contactList[i] = new Contact[grainList[i].getMaxNumberContact()];
grain_position[i] = new Vec3d2;
grainList[i].setContactList(grain_contactList[i]);
grainList[i].setPosition(grain_position[i]);
cudaMemcpy(grain_contactList[i], d_grain_contactList[i], grainList[i].getMaxNumberContact()*sizeof(Contact), cudaMemcpyDeviceToHost);
cudaMemcpy(grain_position[i], d_grain_position[i], sizeof(Vec3d2), cudaMemcpyDeviceToHost);
}
}
__global__ void compute( Grain *g)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
//__syncthreads();
if( i < n )
{
g[i].findContactwithGrain(g);
}
}
void Analysis::runAnalysis()
{
for(int i = 0; i < 3; i ++)
{
clock_t begin = clock();
for(int j = 0; j < 10000; j++)
{
compute<<<(n + 511)/512, 512>>>(d_grainList);
}
clock_t end = clock();
cout << (double)(end-begin)/CLOCKS_PER_SEC << endl;
system("PAUSE");
}
}
int main(void)
{
//grain
Vec3d2 position1; position1[0] = 0;position1[1] = 0;position1[2] = 0;
double radius1 = 1;
////cuda
cout << "PRESS ENTER TO START" << endl;
system("PAUSE");
clock_t begin = clock();
Grain *g, *d_g;
g = new Grain[n];
for(int i = 0; i<n; i++)
{
g[i].setTotalNumberGrain();
}
Grain g1(position1, radius1, 0.1);
for(int i = 0; i <n; i++)
{
g[i] = g1;
g[i].setPositionVec(Vec3d2(3*i+1.5, 1.5, 0));
}
Analysis a(g);
a.copyToDevice();
a.runAnalysis();
clock_t end = clock();
cout << (double)(end-begin)/CLOCKS_PER_SEC << endl;
return 0;
}
答案 0 :(得分:2)
我需要更多代码来验证,但最可能的解释是,当您不包含代码时,实际上并没有将任何数据写入全局内存。当你不向全局内存写任何东西时,nvcc会优化几乎所有东西,直到你什么都不会运行。
包含print语句时也是如此。打印语句被视为输出,因此nvcc无法编译出它所依赖的代码。
例如:
__global__ empty_kernel(int* huge_array, int num_elements){
int local_memory;
for(int i=0; i<num_elements; i++){
local_memory+=huge_array[i];
}
}
的运行速度会快于:
__global__ empty_kernel(int* small_array, int num_elements, int* smaller_array){
int tid = ThreadIdx.x+BlockIdx.x*BlockDim.x;
int local_memory;
for(int i=0; i<5; i++){
local_memory+=huge_array[tid*i];
}
smaller_array[tid]=local_memory;
}
最重要的是,你的第一个内核并不快,它只是没有被运行。
答案 1 :(得分:1)
我认为问题只是if
语句,而不是它有条件执行的语句。有条件的分支在GPU体系结构上可能相当昂贵(虽然它似乎在更新的体系结构中变得更好),并且只有一个分支语句肯定会减慢你的代码。
如果删除if
子句中的语句,编译器会发现没有剩下代码,因此也可以优化if
本身。所以这就是你删除这行代码时看到加速的原因。