Question

目前我正致力于将用cpu C ++编写的分子动力学模拟程序移植到Cuda。简而言之，程序初始化一个原子列表，将控制转移到类CCalc的对象，该对象计算100（或其他数量）迭代的原子力，速度和位置，最后返回到在屏幕上绘制原子。

我的目标是让CCalc中的所有计算密集型函数在gpu上运行。为了避免逐个复制CCalc中的所有计算常量，我决定将整个类对象复制到设备内存，由this__d指向。由于绘图函数是从cpu调用的，原子列表需要每100次迭代在cpu和gpu之间复制一次，因此它不是CCalc的成员。

在函数CCalc::refreshCellLists()中，我想重新排列atom__d（驻留在设备内存中的原子列表），以便将同一单元格中的所有原子组合在一起。换句话说，atom__d需要以cellId作为键进行排序。

由于我不想浪费时间自己实现排序算法，我尝试使用thrust::sort_by_key()。在这里我遇到了困难。函数thrust::sort_by_key()需要device_ptr对象作为参数；但是我无法访问cellId，因为我只能将this__d转换为device_ptr，而它无法在cpu上解引用。

有没有办法做到这一点，而不必打破“gpu上的类”这一结构？

以下是我的代码（摘录）：

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <vector>
#include <thrust\sort.h>
#include <thrust\device_ptr.h>

#define REFRESH_CELL_LISTS 20

// State of a single atom. Arrays of Atom are transferred between host and
// device with plain cudaMemcpy calls (see CCalc::relaxStructure), so the
// member layout has to be identical on both sides.
struct Atom
{
    float pos[3];    // position, three components (presumably x, y, z -- confirm)
    float vel[3];    // velocity, same component order as pos
    float force[3];  // force acting on the atom, same component order as pos
    // others
};

// Host-side atom list; CCalc::relaxStructure uploads it to the device and
// downloads the result back into it.
std::vector<Atom> atom;
// Device copy of the atom list; allocated and freed inside CCalc::relaxStructure.
Atom *atom__d;
// Number of atoms. CCalc initialises its nAtoms member from this value, so it
// must be set before a CCalc object is constructed.
int noOfAtoms = 0;

// Forward declarations: CCalc::refreshCellLists launches makeCells, and the
// kernel's signature in turn takes a CCalc pointer.
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);

/**
 * Host-side driver of the relaxation loop.
 *
 * The constructor allocates the per-atom cell-id array on the device and then
 * stores a byte copy of the whole object in device memory (this__d), so that
 * kernels can read the constants and the cellId pointer through one pointer.
 * The device copy is a snapshot taken at construction time; later changes to
 * host members are not propagated.
 */
class CCalc
{
private:
    // Device-resident byte copy of this object, created at the end of the constructor.
    CCalc *this__d;
public:
    // Atom count, captured from the global noOfAtoms when the object is constructed.
    const int nAtoms = noOfAtoms;
    // Device array with one cell index per atom; filled by the makeCells kernel.
    int *cellId;
    // Number of cells along x and y.
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    CCalc()
    {
        cudaMalloc((void**)&cellId, nAtoms*sizeof(int));

        // some other stuff

        // Must stay last: the device copy has to contain the final values of
        // every member, including the cellId device pointer set above.
        cudaMalloc((void**)&this__d, sizeof(CCalc));
        cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
    }

    // destructor

    /**
     * Uploads the global atom list, runs numOfIterations iterations on the
     * device and downloads the result back into the global atom vector.
     *
     * The global vector `atom` must hold at least nAtoms elements.
     * Does nothing when there are no atoms.
     */
    void relaxStructure(int numOfIterations)
    {
        // Empty system: indexing an empty vector and launching an empty grid
        // are both invalid, so there is nothing to do.
        if(nAtoms <= 0) return;

        const size_t atomBytes = nAtoms*sizeof(Atom);

        cudaMalloc((void**)&atom__d, atomBytes);
        cudaMemcpy(atom__d, atom.data(), atomBytes, cudaMemcpyHostToDevice);

        for(int iter = 0; iter < numOfIterations; iter++)
        {
            // stuff
            // Cell lists are rebuilt on iteration 0 and every REFRESH_CELL_LISTS-th one after it.
            if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
            // calculate forces; update velocities and positions
        }

        cudaMemcpy(atom.data(), atom__d, atomBytes, cudaMemcpyDeviceToHost);
        cudaFree(atom__d);
    }

    // functions for force, velocity and position calculation

    /**
     * Recomputes the cell index of every atom on the device.
     *
     * Expects atom__d to point at the device atom list, i.e. to be called
     * from within relaxStructure. Does nothing when there are no atoms.
     */
    void refreshCellLists()
    {
        // A grid of zero blocks is an invalid launch configuration.
        if(nAtoms <= 0) return;

        // One thread per atom, 32 threads per block, block count rounded up.
        makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
        cudaDeviceSynchronize();
        // sort atom__d array using cellId as keys;
        // here is where I would like to use thrust::sort_by_key()
    }
};

/**
 * Writes a cell index for every atom into C->cellId, one thread per atom.
 *
 * Launch layout: 1-D grid of 1-D blocks providing at least C->nAtoms threads
 * in total; surplus threads are filtered out by the bounds check.
 *
 * C  device copy of the CCalc object; nAtoms, nCellX and nCellY are read from
 *    it and C->cellId[index] is written.
 * A  device atom list; not used by the current placeholder mapping.
 */
__global__ void makeCells(CCalc *C, Atom *A)
{
    int index = blockDim.x*blockIdx.x + threadIdx.x;
    if(index < C->nAtoms)
    {
        // determine cell x, y based on position
        // for now let's use an arbitrary mapping to obtain x, y
        // The square is formed in 64 bits: as an int, index * index overflows
        // once index exceeds 46340, which is undefined behaviour and can
        // produce a negative remainder and therefore a negative cell id.
        const long long indexSq = (long long)index * index;
        int X = (int)(indexSq % C->nCellX);
        int Y = (int)(indexSq % C->nCellY);

        C->cellId[index] = X + Y * C->nCellX;
    }
}

// Program entry: selects the GPU, sizes the global atom list, then alternates
// forever between drawing and a batch of relaxation iterations.
int main()
{
    const int kStepsPerFrame = 100;

    cudaSetDevice(0);

    // The count has to be final before calcObject is created, because CCalc
    // captures noOfAtoms into its nAtoms member during construction.
    noOfAtoms = 1000; // normally defined by input file
    atom.resize(noOfAtoms);

    // initialise atom positions, velocities and forces

    CCalc calcObject;

    for(;;) // as long as we need
    {
        // draw atoms on screen
        calcObject.relaxStructure(kStepsPerFrame);
    }
}

非常感谢。

Answer 1

换句话说，atom__d需要以cellId作为键进行排序。

应该可以在refreshCellLists方法中指定的位置执行此操作。为简单起见，我选择直接使用原始设备指针（尽管我们可以轻松地将这些原始设备指针包装在thrust::device_ptr中）并结合thrust::device执行策略。这是一个有效的例子：

$ cat t1156.cu
#include <vector>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>

#define REFRESH_CELL_LISTS 20

// State of a single atom. Arrays of Atom are transferred between host and
// device with plain cudaMemcpy calls and are reordered on the device by
// thrust::sort_by_key (see CCalc), so the member layout has to be identical
// on both sides.
struct Atom
{
    float pos[3];    // position, three components (presumably x, y, z -- confirm)
    float vel[3];    // velocity, same component order as pos
    float force[3];  // force acting on the atom, same component order as pos
    // others
};

// Host-side atom list; CCalc::relaxStructure uploads it to the device and
// downloads the result back into it.
std::vector<Atom> atom;
// Device copy of the atom list; allocated and freed inside CCalc::relaxStructure.
Atom *atom__d;
// Number of atoms. CCalc initialises its nAtoms member from this value, so it
// must be set before a CCalc object is constructed.
int noOfAtoms = 0;

// Forward declarations: CCalc::refreshCellLists launches makeCells, and the
// kernel's signature in turn takes a CCalc pointer.
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);

/**
 * Host-side driver of the relaxation loop.
 *
 * The constructor allocates the per-atom cell-id array on the device and then
 * stores a byte copy of the whole object in device memory (this__d), so that
 * kernels can read the constants and the cellId pointer through one pointer.
 * The device copy is a snapshot taken at construction time; later changes to
 * host members are not propagated.
 *
 * cellId itself is an ordinary host-visible member holding a device address,
 * which is why host code can hand it straight to Thrust.
 */
class CCalc
{
private:
    // Device-resident byte copy of this object, created at the end of the constructor.
    CCalc *this__d;
public:
    // Atom count, captured from the global noOfAtoms when the object is constructed.
    const int nAtoms = noOfAtoms;
    // Device array with one cell index per atom; filled by the makeCells kernel.
    int *cellId;
    // Number of cells along x and y.
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    CCalc()
    {
        cudaMalloc((void**)&cellId, nAtoms*sizeof(int));

        // some other stuff

        // Must stay last: the device copy has to contain the final values of
        // every member, including the cellId device pointer set above.
        cudaMalloc((void**)&this__d, sizeof(CCalc));
        cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
    }

    // destructor

    /**
     * Uploads the global atom list, runs numOfIterations iterations on the
     * device and downloads the result back into the global atom vector.
     *
     * The global vector `atom` must hold at least nAtoms elements.
     * Does nothing when there are no atoms.
     */
    void relaxStructure(int numOfIterations)
    {
        // Empty system: indexing an empty vector and launching an empty grid
        // are both invalid, so there is nothing to do.
        if(nAtoms <= 0) return;

        const size_t atomBytes = nAtoms*sizeof(Atom);

        cudaMalloc((void**)&atom__d, atomBytes);
        cudaMemcpy(atom__d, atom.data(), atomBytes, cudaMemcpyHostToDevice);

        for(int iter = 0; iter < numOfIterations; iter++)
        {
            // stuff
            // Cell lists are rebuilt on iteration 0 and every REFRESH_CELL_LISTS-th one after it.
            if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
            // calculate forces; update velocities and positions
        }

        cudaMemcpy(atom.data(), atom__d, atomBytes, cudaMemcpyDeviceToHost);
        cudaFree(atom__d);
    }

    // functions for force, velocity and position calculation

    /**
     * Recomputes the cell index of every atom and reorders the device atom
     * list so that atoms with the same cell index are contiguous.
     *
     * Expects atom__d to point at the device atom list, i.e. to be called
     * from within relaxStructure. Does nothing when there are no atoms.
     */
    void refreshCellLists()
    {
        // A grid of zero blocks is an invalid launch configuration, and the
        // resulting pending error could surface inside the Thrust call below.
        if(nAtoms <= 0) return;

        // One thread per atom, 32 threads per block, block count rounded up.
        makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
        cudaDeviceSynchronize();
        // sort atom__d array using cellId as keys;
        // thrust::device tells Thrust that the raw pointers refer to device
        // memory; both cellId and atom__d are reordered in place.
        thrust::sort_by_key(thrust::device, cellId, cellId+nAtoms, atom__d);
    }
};

/**
 * Writes a cell index for every atom into C->cellId, one thread per atom.
 *
 * Launch layout: 1-D grid of 1-D blocks providing at least C->nAtoms threads
 * in total; surplus threads are filtered out by the bounds check.
 *
 * C  device copy of the CCalc object; nAtoms, nCellX and nCellY are read from
 *    it and C->cellId[index] is written.
 * A  device atom list; not used by the current placeholder mapping.
 */
__global__ void makeCells(CCalc *C, Atom *A)
{
    int index = blockDim.x*blockIdx.x + threadIdx.x;
    if(index < C->nAtoms)
    {
        // determine cell x, y based on position
        // for now let's use an arbitrary mapping to obtain x, y
        // The square is formed in 64 bits: as an int, index * index overflows
        // once index exceeds 46340, which is undefined behaviour and can
        // produce a negative remainder and therefore a negative cell id.
        const long long indexSq = (long long)index * index;
        int X = (int)(indexSq % C->nCellX);
        int Y = (int)(indexSq % C->nCellY);

        C->cellId[index] = X + Y * C->nCellX;
    }
}

// Program entry: selects the GPU, sizes the global atom list, then runs a
// fixed number of draw / relax rounds.
int main()
{
    const int kFrames = 100;
    const int kStepsPerFrame = 100;

    cudaSetDevice(0);

    // The count has to be final before calcObject is created, because CCalc
    // captures noOfAtoms into its nAtoms member during construction.
    noOfAtoms = 1000; // normally defined by input file
    atom.resize(noOfAtoms);

    // initialise atom positions, velocities and forces

    CCalc calcObject;

    int frame = 0;
    while (frame < kFrames) // as long as we need
    {
        // draw atoms on screen
        calcObject.relaxStructure(kStepsPerFrame);
        ++frame;
    }
}
$ nvcc -std=c++11 -o t1156 t1156.cu
$ cuda-memcheck ./t1156
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

在构建Thrust代码时，尤其是在Windows上，我通常会给出一组建议，这些建议汇总于此处（here）。

Thrust - 在gpu上对类对象的成员数组进行排序

1 个答案: