目前我正致力于将一个用C++编写、在CPU上运行的分子动力学模拟程序移植到CUDA。简而言之,程序先初始化一个原子列表,然后把控制权交给类CCalc的对象;该对象对原子的受力、速度和位置进行100次(或其他次数)迭代计算,最后返回,以便在屏幕上绘制原子。
我的目标是让CCalc中所有计算密集型的函数都在GPU上运行。为了避免逐个复制CCalc中的所有计算常量,我决定把整个类对象复制到设备内存中,并用this__d指向它。由于绘图函数是从CPU调用的,原子列表每隔100次迭代就需要在CPU和GPU之间复制一次,因此它不是CCalc的成员。
在函数CCalc::refreshCellLists()中,我想重新排列atom__d(驻留在设备内存中的原子列表),以便把位于同一单元格中的所有原子排在一起。换句话说,atom__d需要以cellId作为键进行排序。
由于我不想浪费时间去实现自己的排序算法,我尝试使用thrust::sort_by_key()。在这里我遇到了困难:函数thrust::sort_by_key()需要device_ptr对象作为参数;但是我无法访问cellId,因为我只能把this__d转换为device_ptr,而它无法在CPU上解引用。
有没有办法做到这一点,而不必打破GPU“上”的类结构?
以下是我的代码(摘录):
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <vector>
#include <thrust\sort.h>
#include <thrust\device_ptr.h>
#define REFRESH_CELL_LISTS 20
// State of a single atom. Whole arrays of Atom are copied byte-wise between
// host and device with cudaMemcpy, so the layout is shared by both sides.
struct Atom
{
float pos[3];   // position (three components; presumably x, y, z)
float vel[3];   // velocity, same component order as pos
float force[3]; // force acting on the atom, same component order as pos
// others
};
// Host-side atom list; main() resizes it to noOfAtoms and
// CCalc::relaxStructure() copies it to the device and back.
std::vector<Atom> atom;
// Device-side copy of the atom list; allocated and freed inside
// CCalc::relaxStructure().
Atom *atom__d;
// Number of atoms. CCalc::nAtoms is initialised from this value, so it has to
// be set before a CCalc object is constructed.
int noOfAtoms = 0;
// Forward declarations: CCalc::refreshCellLists() launches makeCells, whose
// definition follows the class because it needs the complete CCalc type.
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
// Drives the simulation from the host. The constructor places a byte-wise
// copy of the object in device memory (this__d) so that kernels can reach the
// constants and the cellId array through one CCalc pointer.
class CCalc
{
private:
    CCalc *this__d; // device-resident copy of *this
public:
    const int nAtoms = noOfAtoms;
    int *cellId; // device array holding one cell index per atom
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    CCalc()
    {
        const size_t cellIdBytes = sizeof(int) * nAtoms;
        cudaMalloc(reinterpret_cast<void **>(&cellId), cellIdBytes);
        // some other stuff
        cudaMalloc(reinterpret_cast<void **>(&this__d), sizeof(CCalc));
        // Copy last: the snapshot then already holds the device pointers set above.
        cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
    }

    // destructor

    // Uploads the atom list, runs numOfIterations simulation steps on the
    // device and downloads the result into the host atom list again.
    void relaxStructure(int numOfIterations)
    {
        const size_t atomBytes = sizeof(Atom) * nAtoms;
        cudaMalloc(reinterpret_cast<void **>(&atom__d), atomBytes);
        cudaMemcpy(atom__d, &atom[0], atomBytes, cudaMemcpyHostToDevice);

        for (int step = 0; step < numOfIterations; ++step)
        {
            // stuff
            if (step % REFRESH_CELL_LISTS == 0)
            {
                refreshCellLists();
            }
            // calculate forces; update velocities and positions
        }

        cudaMemcpy(&atom[0], atom__d, atomBytes, cudaMemcpyDeviceToHost);
        cudaFree(atom__d);
    }

    // functions for force, velocity and position calculation

    // Recomputes the cell index of every atom on the device.
    void refreshCellLists()
    {
        const int threadsPerBlock = 32;
        const int blockCount = (nAtoms + threadsPerBlock - 1) / threadsPerBlock;
        makeCells<<<blockCount, threadsPerBlock>>>(this__d, atom__d);
        cudaDeviceSynchronize();
        // sort atom__d array using cellId as keys;
        // here is where I would like to use thrust::sort_by_key()
    }
};
// Writes the cell index of every atom into C->cellId.
//
// Launch layout: 1D grid of 1D blocks, one thread per atom; any block size
// works because surplus threads leave through the bounds check. No shared
// memory is used. C must point to a CCalc copy in device memory whose cellId
// member is a device array of at least C->nAtoms ints. A is currently unused.
__global__ void makeCells(CCalc *C, Atom *A)
{
    const int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= C->nAtoms)
    {
        return;
    }

    // determine cell x, y based on position
    // for now let's use an arbitrary mapping to obtain x, y
    // The square is formed in 64-bit unsigned arithmetic: as an int product it
    // overflows (undefined behaviour, possibly negative cell ids) once index
    // exceeds 46340.
    const unsigned long long squared = static_cast<unsigned long long>(index) * index;
    const int X = static_cast<int>(squared % C->nCellX);
    const int Y = static_cast<int>(squared % C->nCellY);
    C->cellId[index] = X + Y * C->nCellX;
}
// Entry point: selects the GPU, creates the host atom list and then alternates
// forever between drawing and 100 relaxation iterations.
int main()
{
    cudaSetDevice(0);

    noOfAtoms = 1000; // normally defined by input file
    atom.resize(noOfAtoms);
    // initialise atom positions, velocities and forces

    CCalc calcObject;
    for (;;) // as long as we need
    {
        // draw atoms on screen
        calcObject.relaxStructure(100);
    }
}
非常感谢。
答案 0 :(得分:0)
换句话说,atom__d需要用cellId作为键进行排序。
应该可以在refreshCellLists方法中你所标出的位置执行此操作。为简单起见,我选择直接使用原始设备指针(尽管我们也可以很容易地把这些原始设备指针包装在thrust::device_ptr中),并配合thrust::device执行策略。下面是一个完整的可运行示例:
$ cat t1156.cu
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#define REFRESH_CELL_LISTS 20
// State of a single atom. Whole arrays of Atom are copied byte-wise between
// host and device with cudaMemcpy and are moved as values by
// thrust::sort_by_key, so the layout is shared by both sides.
struct Atom
{
float pos[3];   // position (three components; presumably x, y, z)
float vel[3];   // velocity, same component order as pos
float force[3]; // force acting on the atom, same component order as pos
// others
};
// Host-side atom list; main() resizes it to noOfAtoms and
// CCalc::relaxStructure() copies it to the device and back.
std::vector<Atom> atom;
// Device-side copy of the atom list; allocated and freed inside
// CCalc::relaxStructure() and reordered by CCalc::refreshCellLists().
Atom *atom__d;
// Number of atoms. CCalc::nAtoms is initialised from this value, so it has to
// be set before a CCalc object is constructed.
int noOfAtoms = 0;
// Forward declarations: CCalc::refreshCellLists() launches makeCells, whose
// definition follows the class because it needs the complete CCalc type.
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
// Drives the simulation from the host. The constructor places a byte-wise
// snapshot of the object in device memory (this__d) so that kernels can reach
// the constants and the cellId array through one CCalc pointer. The snapshot
// is taken once, in the constructor.
//
// Error handling: every CUDA runtime call is checked and a failure is reported
// by throwing std::runtime_error; thrust::sort_by_key reports failures by
// throwing as well.
class CCalc
{
private:
    CCalc *this__d; // device-resident copy of *this

    // Throws std::runtime_error naming the failed step when status is not cudaSuccess.
    static void check(cudaError_t status, const char *what)
    {
        if (status != cudaSuccess)
        {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    }

public:
    const int nAtoms = noOfAtoms; // fixed at construction from the global atom count
    int *cellId;                  // device array holding one cell index per atom
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    // Allocates the device-side cellId array and uploads the snapshot of *this.
    CCalc() : this__d(nullptr), cellId(nullptr)
    {
        if (nAtoms < 0)
        {
            throw std::runtime_error("CCalc: noOfAtoms must not be negative");
        }
        check(cudaMalloc((void **)&cellId, nAtoms * sizeof(int)), "cudaMalloc(cellId)");
        // some other stuff
        check(cudaMalloc((void **)&this__d, sizeof(CCalc)), "cudaMalloc(this__d)");
        // Copied last so that the snapshot already holds the device pointers set above.
        check(cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice),
              "cudaMemcpy(this__d)");
    }

    // destructor

    // Uploads the host atom list, runs numOfIterations simulation steps on the
    // device (refreshing the cell lists every REFRESH_CELL_LISTS steps) and
    // downloads the result into the host atom list again.
    // Throws std::runtime_error on any CUDA failure or when the host list holds
    // fewer than nAtoms entries; the device buffer is released in either case.
    void relaxStructure(int numOfIterations)
    {
        if (nAtoms == 0)
        {
            return; // nothing to simulate; also keeps &atom[0] off an empty vector
        }
        if (atom.size() < static_cast<std::vector<Atom>::size_type>(nAtoms))
        {
            throw std::runtime_error("relaxStructure: host atom list is shorter than nAtoms");
        }

        const size_t atomBytes = nAtoms * sizeof(Atom);
        check(cudaMalloc((void **)&atom__d, atomBytes), "cudaMalloc(atom__d)");
        try
        {
            check(cudaMemcpy(atom__d, &atom[0], atomBytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(atom -> atom__d)");
            for (int iter = 0; iter < numOfIterations; iter++)
            {
                // stuff
                if (iter % REFRESH_CELL_LISTS == 0) refreshCellLists();
                // calculate forces; update velocities and positions
            }
            check(cudaMemcpy(&atom[0], atom__d, atomBytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(atom__d -> atom)");
        }
        catch (...)
        {
            // Best effort: the error already in flight is the one worth reporting.
            cudaFree(atom__d);
            atom__d = nullptr;
            throw;
        }

        Atom *finished = atom__d;
        atom__d = nullptr; // never leave the global pointing at freed memory
        check(cudaFree(finished), "cudaFree(atom__d)");
    }

    // functions for force, velocity and position calculation

    // Recomputes the cell index of every atom and then reorders atom__d so that
    // atoms with the same cellId are contiguous. cellId itself ends up sorted.
    // sort_by_key is not a stable sort, so the order of atoms inside one cell
    // is unspecified (thrust::stable_sort_by_key would preserve it).
    void refreshCellLists()
    {
        if (nAtoms == 0)
        {
            return; // a grid of zero blocks is an invalid launch configuration
        }

        // The kernel bounds-checks its index, so any block size is valid.
        const int threadsPerBlock = 128;
        const int blocks = nAtoms / threadsPerBlock + (nAtoms % threadsPerBlock != 0);
        makeCells<<<blocks, threadsPerBlock>>>(this__d, atom__d);
        check(cudaGetLastError(), "makeCells launch");         // bad launch configuration
        check(cudaDeviceSynchronize(), "makeCells execution"); // faults inside the kernel

        // sort atom__d array using cellId as keys;
        thrust::sort_by_key(thrust::device, cellId, cellId + nAtoms, atom__d);
    }
};
// Writes the cell index of every atom into C->cellId.
//
// Launch layout: 1D grid of 1D blocks, one thread per atom; any block size
// works because surplus threads leave through the bounds check. No shared
// memory is used. C must point to a CCalc copy in device memory whose cellId
// member is a device array of at least C->nAtoms ints. A is currently unused.
__global__ void makeCells(CCalc *C, Atom *A)
{
    const int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= C->nAtoms)
    {
        return;
    }

    // determine cell x, y based on position
    // for now let's use an arbitrary mapping to obtain x, y
    // The square is formed in 64-bit unsigned arithmetic: as an int product it
    // overflows (undefined behaviour, possibly negative cell ids) once index
    // exceeds 46340.
    const unsigned long long squared = static_cast<unsigned long long>(index) * index;
    const int X = static_cast<int>(squared % C->nCellX);
    const int Y = static_cast<int>(squared % C->nCellY);
    C->cellId[index] = X + Y * C->nCellX;
}
// Entry point of the example: selects the GPU, creates the host atom list and
// runs 100 rounds of "draw, then 100 relaxation iterations".
// Returns 0 on success and 1 when device selection fails or an exception
// reaches main.
int main()
{
    const cudaError_t deviceStatus = cudaSetDevice(0);
    if (deviceStatus != cudaSuccess)
    {
        std::fprintf(stderr, "cudaSetDevice(0) failed: %s\n", cudaGetErrorString(deviceStatus));
        return 1;
    }

    try
    {
        noOfAtoms = 1000; // normally defined by input file
        atom.resize(noOfAtoms);
        // initialise atom positions, velocities and forces
        CCalc calcObject;
        for (int i = 0; i < 100; i++) // as long as we need
        {
            // draw atoms on screen
            calcObject.relaxStructure(100);
        }
    }
    catch (const std::exception &e)
    {
        // thrust algorithms report failures by throwing (thrust::system_error
        // derives from std::runtime_error); report instead of terminating.
        std::fprintf(stderr, "simulation failed: %s\n", e.what());
        return 1;
    }
    return 0;
}
$ nvcc -std=c++11 -o t1156 t1156.cu
$ cuda-memcheck ./t1156
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
在构建thrust代码时(尤其是在Windows上),我通常会给出一系列建议,其汇总见here。