Question

我正在构建一个n-body模拟，使用CUDA来提高性能。我目前正在进一步并行化系统，因此粒子之间的每个交互都在一个单独的线程上运行。这将理论复杂度降低到1，仅受GPU速度的限制。

为了实现这一目标，我尝试使用(N/T, N/T)网格和T*T块（其中T是块）使用N ^ 2块块（其中N为粒子数）来启动内核每个块的线程数）。我已经能够启动一个N * N网格，但每当我尝试使用多维块（线程）时内核崩溃：

error code invalid configuration arguments

这是T = 512和N = 5000但是将这些减少到T = 128并且N = 1000没有效果。以下是一些规格和代码：

Cuda SDK版本：7.5

GPU：GTX 970 4gb

CC版本：5.2

在Windows 7中的MSVS 2013 64位编译

内核启动代码

dim3 block(TPB, TPB);
dim3 grid;
grid.x = (numParticles + TPB - 1) / TPB;
grid.y = (numParticles + TPB - 1) / TPB;

doParticles<<< grid, block >>>(d_pos, d_vel, d_acc, d_mass, numParticles, dt);

如何更改此代码以实现目标？

我可以发布一些内核代码等但不认为它很重要，因为内核甚至没有启动。如果有任何其他信息有用，请告诉我。

提前致谢。

编辑：

MCVE

主要

#define TPB 32
....

unsigned int numParticles = 1000;
p_type* h_pos;
p_type* h_vel;
p_type* h_acc;
p_type* h_mass;

p_type* d_pos;
p_type* d_vel;
p_type* d_acc;
p_type* d_mass;


int pointsPerParticleVec = 3;
size_t size = sizeof(p_type) * 3 * numParticles;

h_pos = (p_type*)malloc(size);
h_vel = (p_type*)malloc(size);
h_acc = (p_type*)malloc(size);
h_mass = (p_type*)malloc(size / 3);

d_pos = NULL;
d_vel = NULL;
d_acc = NULL;

cudaError_t err = cudaSuccess;
//allocate space on GPU
err = cudaMalloc((void **)&d_pos, size);
err = cudaMalloc((void **)&d_vel, size);
err = cudaMalloc((void **)&d_acc, size);
err = cudaMalloc((void **)&d_mass, size / 3);

//nothing really matters for this example just making sure no gargage values happen
for (int partIt = 0; partIt < numParticles; partIt++)
{
    int index = partIt * 3;

    h_pos[index] = 0;
    h_pos[index + 1] = 0;
    h_pos[index + 2] = 0;

    h_vel[index] = 0;
    h_vel[index + 1] = 0;
    h_vel[index + 2] = 0;

    h_acc[index] = 0;
    h_acc[index + 1] = 0;
    h_acc[index + 2] = 0;

    h_mass[partIt] = 0;
}

err = cudaMemcpy(d_pos, h_pos, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_vel, h_vel, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_acc, h_acc, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_mass, h_mass, size / 3, cudaMemcpyHostToDevice);

while (true)    //display functionality removed for now
{
    //do calculations
    float dt = .1;
    dim3 block(TPB, TPB);
    dim3 grid;
    grid.x = (numParticles + TPB - 1) / TPB;
    grid.y = (numParticles + TPB - 1) / TPB;

    doParticles << < grid, block >> >(d_pos, d_vel, d_acc, d_mass, numParticles, dt);   //<<<<<<<<<<<<here is where it does not launch

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));   //see the error pop up here
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    int numBlocks2 = (numParticles * 3 + TPB - 1) / TPB;

    //add acceleration to velocity
    ARR_ADD << <numBlocks2, TPB >> >(d_vel, d_acc, numParticles * 3);

    cudaDeviceSynchronize();
    //reset acceleration vector 
    ARR_SET << <numBlocks2, TPB >> >(d_acc, 0.0f, numParticles * 3);

    //add velocity to position
    POS_ADD << <numBlocks2, TPB >> >(d_pos, d_vel, numParticles * 3, dt);

    //copy vector back to cpu (until opengl-cuda gets implemented)
    cudaMemcpy(h_pos, d_pos, sizeof(p_type) * 3 * numParticles, cudaMemcpyDeviceToHost);
}

内核

__device__ float fInvSqrt_D(const float& in)
{
    long i;
    float x2, y;
    const float threehalfs = 1.5F;

    x2 = in * 0.5F;
    y = in;
    i = *(long *)&y;
    i = 0x5f3759df - (i >> 1);
    y = *(float *)&i;
    y = y * (threehalfs - (x2 * y * y));
    y = y * (threehalfs - (x2 * y * y));    //extra precision
    return abs(y);
}

__global__ void POS_ADD(p_type* getter, const p_type *giver, int N, float dt)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < N)
    {
        getter[index] = getter[index] + (giver[index]*dt);
    }

}

__global__ void ARR_ADD(p_type* getter, const p_type *giver, int N)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < N)
    {
        getter[index] = getter[index] + giver[index];
    }

}

__global__ void ARR_SET(p_type* getter, const p_type value, int N)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < N)
    {
        getter[index] = value;
    }
}



__global__ void doParticles(p_type* pos, p_type* vel, p_type* acc, p_type* mass, int numParticles, float tstep)
{
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    int threadIndex = blockId * blockDim.x + threadIdx.x;

    int cRowPos = blockId % gridDim.y;
    int notInRow = blockId - cRowPos;

    int Y = blockId/gridDim.x + threadIdx.y;    //slower changing iterator
    int X = threadIndex - ((notInRow) * TPB);   //fast changing iterator

    int pIndex1 = X * 3;
    int pIndex2 =Y * 3;

    if (pIndex1 != pIndex2 && Y < numParticles)
    {

        p_type diffx = (pos[pIndex1] - pos[pIndex2]);           //calculating difference between points
        p_type diffy = (pos[pIndex1 + 1] - pos[pIndex2 + 1]);
        p_type diffz = (pos[pIndex1 + 2] - pos[pIndex2 + 2]);

        p_type distsqr = diffx*diffx + diffy*diffy + diffz*diffz;

        if (distsqr < 0)
        {
            distsqr *= -1;
        }
        if (distsqr < 500)
        {
            distsqr = 500;
        }

        p_type attraction = (mass[X] * mass[Y]) / (distsqr);    //gravity equation


        p_type invsqrt = fInvSqrt_D((float)distsqr);
        p_type normx = invsqrt*diffx;
        p_type normy = invsqrt*diffy;
        p_type normz = invsqrt*diffz;

        p_type forcex = normx * -attraction;
        p_type forcey = normy * -attraction;
        p_type forcez = normz * -attraction;

        acc[pIndex1] += (forcex * tstep) / mass[X];
        acc[pIndex1 + 1] += (forcey * tstep) / mass[X];
        acc[pIndex1 + 2] += (forcez * tstep) / mass[X];


    }
}

是的，我知道doParticle内核中的索引已经破解。我计划修复它启动的那个。：）

再次感谢。

Answer 1

CUDA线程块最多限制为1024个线程，块中的总线程数是线程块尺寸的乘积：

dim3 block(TPB, TPB);

因此，任何大于32的TPB值都不会起作用，当您尝试启动任何此类内核时，您将收到无效的配置参数错误。

因此，将T或TPB减少为32，您应该可以启动内核。

在Cuda崩溃中获得2d线程

内核启动代码

MCVE

1 个答案: