Question

我正在尝试执行以下操作（简化）：请阅读编辑部分！

// Intended as the next free write position inside `out`, shared by the threads.
// NOTE(review): CUDA does not permit an initializer on a __shared__ variable, and
// shared memory is scoped to one block rather than the whole grid - confirm the
// intended scope before turning this sketch into real code.
__shared__ int currentPos = 0;
// Pseudo-code sketch of the problem: every thread computes a result array whose
// length (`size`) is only known after the computation, then wants to append that
// array to `out` without overlapping the other threads' output.
// NOTE(review): the return type is missing and the first parameter is named
// `int` (a keyword; presumably `in` was meant). `size` and `calculation` are
// not declared in this snippet.
__global__ myThreadedFunction(float *int, float *out)
{
    // do calculations with in values
    ...

    // now first thread reach this:
    //suspend other threads up here

    // Move the output pointer to the current shared write position.
    out += currentPos;
    for (int i = 0; i < size; ++i)
    {
        // NOTE(review): this dereferences the integer counter `currentPos`;
        // presumably `*(out++)` was intended. As written the counter is also
        // incremented once per element here and again by `size` after the loop.
        *(currentPos++) =  calculation[i];
    }
    currentPos +=  size;

    // now thread is finish, other threads can
    // go on with writing
}

那么如何在写入同一块内存之前挂起其他线程？我不能让各线程同时写入，因为我事先不知道每个线程计算出的数组的大小（即 calculation[i] 对应的 size）。

我知道有 syncthreads 和 threadfence ，但我不知道如何正确使用它们来解决这个问题。

编辑：我想做的是：

我有2个线程（例如）。每个线程都使用 float *in 中的数据计算出一个新数组。

线程1计算： {1,3,2,4}

线程2计算： {3,2,5,6,3,4}

计算后知道这些数组的大小。现在我想在float * out中编写这些数组。

线程1和线程2谁先写入并不重要。输出可以是：{1,3,2,4,3,2,5,6,3,4} 或 {3,2,5,6,3,4,1,3,2,4}

那么如何计算输出数组的位置？

我不想使用固定的“数组大小”，因为那样输出会变成：{1,3,2,4,?,?,3,2,5,6,3,4}

我想我可以为下一个写作位置提供共享变量POSITION。

线程1到达写入点（计算新数组后）。线程1在共享变量POSITION中写入他的数组大小（4）。

当线程1正在将他的临时数组写入输出数组时，线程2读取变量POSITION，把他自己的临时数组大小（6）加到这个变量上，并从线程1结束的位置开始写入。

如果有线程3，他同样会读取POSITION，加上他的数组大小，并从线程2结束的位置开始写入输出。

所以任何人都有想法？

Answer 1

从概念上讲，可以使用一个共享数组为每个线程存储索引，从而让所有线程并发输出，如下所示：

// Sketch: every thread publishes its output length in a shared array, a running
// sum turns the lengths into end offsets, and each thread derives its own start
// offset so that all threads can write their output concurrently.
// NOTE(review): pseudo-code - the return type is missing, the first parameter is
// named `int` (a keyword; presumably `in`), and `tid` / `size` are not declared
// in this snippet (presumably tid is threadIdx.x - confirm).
__global__ myThreadedFunction(float *int, float *out)
{

    // One slot per thread of the block.
    // NOTE(review): the element type is missing (presumably int) and
    // blockDim.x is not a compile-time constant.
    __shared__ index[blockDim.x];//replace the size with a constant
    // do calculations with in values
    ...



    index[tid] = size;// assuming size is the size of the array you output
    // Serial inclusive running sum: at step i only thread i adds the total
    // accumulated by thread i-1. The barrier sits outside the `if`, so every
    // thread of the block reaches it, and it orders step i-1's write before
    // step i's read.
    //you could do a reduction on this for loop for better performance.
    for(int i = 1; i < blockDim.x; ++i) {
        __syncthreads();
        if(tid == i) {
            index[tid] += index[tid-1];
        }
    }
    // index[tid] now holds the end offset of this thread's output; subtracting
    // its own length gives the first index it writes to.
    // NOTE(review): the offsets come from a __shared__ array, so they are
    // relative to the block - a multi-block launch would presumably need an
    // additional per-block base offset.
    int startposition = index[tid] - size; // you want to start at the start, not where the index ends

    //do your output for all threads concurrently where startposition is the first index you output to

}

所以你要做的是：把每个线程要输出的大小赋给 index[tid]（其中 tid 是线程索引 threadIdx.x），然后对数组做累加（前缀和）；累加之后，index[tid] - size 就是该线程在输出数组中的起始偏移（从线程0的输出起算）。这个求和可以很容易地用归约（reduction）来完成。

Answer 2

此代码按预期工作。它并发地读取 input[]。对于每个输入元素 size，它按照 input[] 中存储的顺序，依次向 result 写入 size 个值（1 到 size）。

请注意，写入过程可能比在CPU上花费更长的时间。由于您已经知道要写入的每个线程的数据大小，因此您可能希望先使用parallel prefix sum计算每个线程的写入位置，然后同时写入数据。

有关代码中使用的__threadfence()的详细信息，请参阅Memory Fence Functions。

#include <iostream>
#include <iterator>

#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

// Number of threads that have finished writing; it doubles as the global id of
// the thread whose turn it is. `volatile` forces every read in the wait loop to
// go to memory instead of a cached register copy.
// NOTE: the kernel never resets this, so it has to be zeroed again (together
// with `pos`) before `serial` can be launched a second time.
volatile __device__ int count = 0;
// Next free index in `result`; written only by the thread that owns the turn.
volatile __device__ int pos = 0;

/**
 * Writes variable-length per-thread output into `result` back to back, in
 * ascending global-thread-id order.
 *
 * Thread `id` reads its length from input[id] and, once it is its turn, writes
 * the values 1..size starting at the current `pos`, then hands the turn to
 * thread id + 1.
 *
 * Launch layout: 1D grid of 1D blocks with at least N threads in total; threads
 * with id >= N do nothing. Uses no shared memory.
 *
 * NOTE(review): threads wait on each other across blocks, so the launch
 * presumably has to be small enough for every participating block to be
 * resident at once (or blocks must be dispatched in index order) - confirm for
 * large grids.
 *
 * @param input   device array of N lengths (stored as float, truncated to int)
 * @param N       number of entries in `input` / number of participating threads
 * @param result  device array with room for the sum of all lengths
 */
__global__ void serial(const float* input, const int N, float* result)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    // Surplus threads have nothing to write and must not touch input[id].
    bool done = (id >= N);

    //parallel part
    const int size = done ? 0 : (int) input[id];

    //serial output
    // Poll until it is this thread's turn. The thread that owns the turn does
    // its work inside the `if`, while the others simply re-poll; there is no
    // barrier in here because the threads of a block leave this loop at
    // different times.
    while (!done)
    {
        const int turn = count;
        if (turn == id)
        {
            const int localpos = pos;
            for (int j = 0; j < size; j++)
            {
                result[localpos + j] = (float) j + 1;
            }
            pos = localpos + size;
            // Make the data and the new `pos` visible device-wide BEFORE the
            // turn is handed over; the next thread reads `pos` as soon as it
            // sees the new `count`.
            __threadfence();
            count = turn + 1;
            done = true;
        }
    }
}

/**
 * Demo driver: fills `input` with the lengths 1..N, launches `serial` with one
 * thread per input element and prints the concatenated output.
 *
 * @return 0 on success, 1 if the kernel launch or its execution failed.
 */
int main()
{
    const int N = 6;
    // One thread per input element. N is a multiple of threadsPerBlock here,
    // so the ceil-division launches exactly N threads.
    const int threadsPerBlock = 3;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    thrust::device_vector<float> input(
            thrust::counting_iterator<float>(1),
            thrust::counting_iterator<float>(1) + N);

    // Thread i writes i values, so the output holds 1 + 2 + ... + N floats.
    thrust::device_vector<float> result(N * (N + 1) / 2);
    serial<<<blocks, threadsPerBlock>>>(
            thrust::raw_pointer_cast(&input[0]),
            N,
            thrust::raw_pointer_cast(&result[0]));

    // A kernel launch returns no status itself: configuration errors show up in
    // cudaGetLastError(), faults during execution at the next synchronization.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        std::cerr << "serial kernel failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    thrust::copy(
            result.begin(), result.end(),
            std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;

    return 0;
}

按预期输出：

1 1 2 1 2 3 1 2 3 4 1 2 3 4 5 1 2 3 4 5 6

CUDA - 同步线程 -> 等到第一个线程完成写入

2 个答案: