I am trying to process the columns of a matrix in parallel with CUDA. Each column of the matrix needs to be evaluated against a given vector; if the result is greater than a certain value, I keep the column, otherwise I remove it from further computation. To avoid copying and restructuring the matrix, I use column indices to indicate whether a column should take part in further computation.
This process needs to be done multiple times, and each pass checks a different subset of the columns, so I created another matrix to store the column indices used in each pass. For example, if I have a matrix with 10 columns and need to repeat the process 4 times, the column_indices
matrix may look like this:
thrust::device_vector<int> column_indices( std::vector<int>( {
    0, 1, -1, -1, -1, // 2 columns contain useful information
    5, 6, 7, -1, -1,  // 3 columns contain useful information
    9, 8, 7, 6, -1,   // 4 columns contain useful information
    4, 3, 2, 1, 0     // 5 columns contain useful information
} ) );
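In other words, each row of this table holds the candidate column indices for one pass, padded with -1, and a separate per-row count records how many entries are valid. A plain C++ sketch of how such a row-major table is read back (the helper name `valid_columns` is made up for illustration):

```cpp
#include <vector>

// Hypothetical helper: given the flattened row-major index table and the
// per-row counts of valid columns, collect the usable indices of one row.
// Entries beyond counts[row] are the -1 padding and are ignored.
std::vector<int> valid_columns( const std::vector<int>& table,
                                const std::vector<int>& counts,
                                int row, int row_width )
{
    const int* begin = table.data() + row * row_width;
    return std::vector<int>( begin, begin + counts[ row ] );
}
```

For example, with the table above and counts { 2, 3, 4, 5 }, `valid_columns( table, counts, 2, 5 )` yields { 9, 8, 7, 6 }.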
This is just a simplified example; in the real code I have to handle matrices with around 500-1000 columns. Since not every pass needs all of the columns and the column count is large, assigning one thread per column of the full matrix is probably a bad idea, as it could leave around half of the threads idle.
So I decided to use dynamic parallelism: a parent kernel checks how many threads are needed, launches a child kernel with exactly that many threads, and allocates exactly as much shared memory as needed.
Here is my code:
#include <algorithm>
#include <iostream>
#include <iterator>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sort.h>
__device__
float calculate( const float* v1, const float* v2, const int length )
{
    // mock calculation resulting in 0.0 for even threads and 0.5 for odd threads
    return threadIdx.x % 2 == 0 ? 0.0f : 0.5f;
}
__global__
void child( float const* input_a, const int nrow, float const* input_b, int* columns, int* counts )
{
    extern __shared__ float results[];
    // input_a is a matrix stored in column-major order, and input_b is a vector
    int thread_column = columns[ threadIdx.x ];
    float const* thread_input = input_a + thread_column * nrow;
    results[ threadIdx.x ] = calculate( thread_input, input_b, nrow );
    //--------------Discussion-----------
    // The race condition is gone if I replace the line above with this:
    //atomicExch( results + threadIdx.x, calculate( thread_input, input_b, nrow ) );
    // However this looks unnecessary to me, as each thread accesses a different address
    //-----------------------------------
    __syncthreads();
    if ( threadIdx.x == 0 ) {
        // sort the column indices in descending order of results, so all indices to be removed end up at the back
        thrust::sort_by_key( thrust::seq, results, results + blockDim.x, columns, thrust::greater<float>() );
        // count the number of indices to be removed
        int remove_count = thrust::count( thrust::seq, results, results + blockDim.x, 0.0f );
        *counts -= remove_count;
    }
}
__global__
void parent( float const* inputs, const int nrow, float const* output, int* column_indices, int* column_counts, const int column_size )
{
    int row_per_group = blockDim.x;
    int group_num = blockIdx.x, row_num = threadIdx.x;
    int tid = group_num * row_per_group + row_num;
    int* indices_for_this_block = column_indices + tid * column_size;
    int* count_for_this_block = column_counts + tid;
    // launch a child kernel to process this row of column indices
    int block_size = *count_for_this_block;
    if ( block_size > 0 ) {
        child<<< 1, block_size, sizeof( float ) * block_size >>>( inputs, nrow, output, indices_for_this_block, count_for_this_block );
        cudaDeviceSynchronize();
    }
}
int main()
{
    thrust::device_vector<int> column_indices( std::vector<int>( {
        0, 1, -1, -1, -1, // 2 columns contain useful information
        5, 6, 7, -1, -1,  // 3 columns contain useful information
        9, 8, 7, 6, -1,   // 4 columns contain useful information
        4, 3, 2, 1, 0     // 5 columns contain useful information
    } ) );
    thrust::device_vector<int> column_count( std::vector<int>( { 2, 3, 4, 5 } ) );
    // Process column_indices in two groups, each group processing two rows.
    // Because we are mocking the correlation results, we don't need real data, so we pass nullptr as the data pointer.
    parent<<< 2, 2 >>>(
        nullptr, 0, nullptr, column_indices.data().get(), column_count.data().get(), 5
    );
    //--------------Discussion-----------
    // The race condition is also gone if I launch the parent kernel like this:
    //parent<<< 2, 2, sizeof( float ) * 5 >>>(
    //    nullptr, 0, nullptr, column_indices.data().get(), column_count.data().get(), 5
    //);
    // But when the total number of columns is big, this approach will fail as it exceeds the maximum capacity of shared memory
    // (although only a fraction of the allocation is actually used).
    //-----------------------------------
    cudaDeviceSynchronize();
    std::cout << "Row #0: ";
    std::copy( column_indices.begin(), column_indices.begin() + column_count[ 0 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
    std::cout << "Row #1: ";
    std::copy( column_indices.begin() + 5, column_indices.begin() + 5 + column_count[ 1 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
    std::cout << "Row #2: ";
    std::copy( column_indices.begin() + 10, column_indices.begin() + 10 + column_count[ 2 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
    std::cout << "Row #3: ";
    std::copy( column_indices.begin() + 15, column_indices.begin() + 15 + column_count[ 3 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
}
Running the code above, I get the correct results:
Row #0: 1,
Row #1: 6,
Row #2: 8, 6,
Row #3: 3, 1,
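As a sanity check, this output can be reproduced on the host without CUDA: the mock calculate() gives 0.0f to even thread indices and 0.5f to odd ones, the columns are then sorted by descending result, and the zero-result columns are dropped. A plain C++ simulation of one child-kernel block (the name `simulate_row` is made up; note that thrust::sort_by_key is not guaranteed to be stable, whereas this sketch uses a stable sort that happens to match the observed output):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Host-side simulation of one child-kernel block with the mock calculate():
// thread t produces 0.0f when t is even and 0.5f when t is odd.
std::vector<int> simulate_row( std::vector<int> columns )
{
    std::vector<float> results( columns.size() );
    for ( std::size_t t = 0; t < columns.size(); ++t )
        results[ t ] = ( t % 2 == 0 ) ? 0.0f : 0.5f;

    // Mirror thrust::sort_by_key( ..., thrust::greater<float>() ):
    // reorder the columns by descending result.
    std::vector<std::size_t> order( columns.size() );
    for ( std::size_t i = 0; i < order.size(); ++i ) order[ i ] = i;
    std::stable_sort( order.begin(), order.end(),
        [&]( std::size_t a, std::size_t b ) { return results[ a ] > results[ b ]; } );

    std::vector<int> sorted;
    for ( std::size_t i : order ) sorted.push_back( columns[ i ] );

    // Mirror thrust::count( ..., 0.0f ): drop the zero-result columns at the back.
    std::size_t keep = columns.size()
        - std::count( results.begin(), results.end(), 0.0f );
    sorted.resize( keep );
    return sorted;
}
```

For example, `simulate_row( { 9, 8, 7, 6 } )` returns { 8, 6 }, matching Row #2 above.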
However, cuda-memcheck
seems to complain about a potential race condition like this:
========= WARN:(Warp Level Programming) Potential RAW hazard detected at __shared__ 0x13 in block (0, 0, 0) :
========= Write Thread (4, 0, 0) at 0x00000070 in /path_to_file/main.cu:23:child(float const *, int, float const *, int*, int*)
========= Read Thread (0, 0, 0) at 0x00000648 in /usr/local/cuda/include/thrust/system/detail/sequential/insertion_sort.h:109:child(float const *, int, float const *, int*, int*)
========= Current Value : 0
Line 23 in main.cu is this line:
results[ threadIdx.x ] = calculate( thread_input, input_b, nrow );
and the reading thread seems to be related to:
thrust::sort_by_key( thrust::seq, results, results + blockDim.x, columns, thrust::greater<float>() );
But why would this happen between two lines that are separated by __syncthreads()? I don't understand why. Could anyone tell me what I am doing wrong? Thank you very much!