So I'm looking into CUDA and dynamic parallelism. Since this is practically my first time working with it, there is a lot to read and discover, but so far I haven't been able to track down the cause of the following problem.
I have the following method, in which I launch the parent kernel:
template <typename DataType>
void SpGEMM(const dCSR<DataType>& mat_A, const dCSR<DataType>& mat_B, dCSR<DataType>& mat_C)
{
    int* mat_A_CurrentRowindex = NULL; // declare pointer
    cudaMalloc(&mat_A_CurrentRowindex, mat_A.rows * sizeof(int));

    // one thread per row of A
    int blockSize = 1024;
    int gridSize = ceil(mat_A.rows / 1024.0f);
    d_expand<<<gridSize, blockSize>>>(mat_A.row_offsets, mat_A.col_ids, mat_A.data,
                                      mat_B.row_offsets, mat_B.col_ids, mat_B.data,
                                      mat_A.rows, mat_A.cols, mat_B.cols, mat_A.nnz, mat_B.nnz,
                                      mat_A_CurrentRowindex);
    cudaDeviceSynchronize();
    cudaFree(mat_A_CurrentRowindex);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
}
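For completeness, this is roughly how I drive it; the loader name below is just a placeholder for my own code that fills the dCSR members from the downloaded matrix:

// Hypothetical driver; loadMtxIntoDeviceCSR stands in for my actual loading code.
dCSR<float> A, B, C;
loadMtxIntoDeviceCSR("Trec5.mtx", A); // matrix from https://sparse.tamu.edu
loadMtxIntoDeviceCSR("Trec5.mtx", B); // multiplied with itself
SpGEMM(A, B, C);                      // C stays unused for now, I only want to print the products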
This is the parent kernel:
template <typename DataType>
__global__ void d_expand(const unsigned int* __restrict matA_row_offsets,
                         const unsigned int* __restrict matA_col_ids,
                         const DataType* __restrict matA_data,
                         const unsigned int* __restrict matB_row_offsets,
                         const unsigned int* __restrict matB_col_ids,
                         const DataType* __restrict matB_data,
                         size_t matA_rows,
                         size_t matA_cols,
                         size_t matB_cols,
                         size_t matA_nnz,
                         size_t matB_nnz,
                         int* currentRowIndex)
{
    // one thread per row of A; each thread launches one child grid for its row
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= matA_rows) return;

    int A_rowsize = matA_row_offsets[index + 1] - matA_row_offsets[index]; // nnz in this row of A
    auto blockSize = 1024;
    int gridSize = ceil(A_rowsize / 1024.0f);
    currentRowIndex[index] = index;

    d_expand2<<<gridSize, blockSize>>>(matA_row_offsets, matA_col_ids, matA_data,
                                       matB_row_offsets, matB_col_ids, matB_data,
                                       matA_rows, matA_cols, matB_cols, matA_nnz, matB_nnz,
                                       currentRowIndex[index]);
}
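One thing I have not done yet is check whether the child launches themselves fail. As far as I understand, the device runtime also provides cudaGetLastError / cudaGetErrorString, so a helper like this could be called right after the d_expand2 launch (untested sketch on my side):

// Untested sketch: report a failed child launch from inside the parent kernel.
// Would be called directly after the d_expand2<<<...>>>(...) launch above.
__device__ void reportChildLaunchError(int row)
{
    cudaError_t childErr = cudaGetLastError(); // device-runtime variant
    if (childErr != cudaSuccess)
        printf("child launch for row %d failed: %s\n", row, cudaGetErrorString(childErr));
}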
And here is the child kernel:
template <typename DataType>
__global__ void d_expand2(const unsigned int* __restrict matA_row_offsets,
                          const unsigned int* __restrict matA_col_ids,
                          const DataType* __restrict matA_data,
                          const unsigned int* __restrict matB_row_offsets,
                          const unsigned int* __restrict matB_col_ids,
                          const DataType* __restrict matB_data,
                          size_t matA_rows,
                          size_t matA_cols,
                          size_t matB_cols,
                          size_t matA_nnz,
                          size_t matB_nnz,
                          int currentRowIndex)
{
    // one thread per nonzero of row currentRowIndex of A
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    int rowLength = matA_row_offsets[currentRowIndex + 1] - matA_row_offsets[currentRowIndex];
    if (index >= rowLength) return;

    auto matA_colidx = matA_col_ids[matA_row_offsets[currentRowIndex] + index]; // column index of the A entry handled by this thread
    auto B_rowsize = matB_row_offsets[matA_colidx + 1] - matB_row_offsets[matA_colidx]; // nnz in the corresponding row of B
    float C_buffer;
    for (auto i = 0; i < B_rowsize; i++)
    {
        C_buffer = matA_data[matA_row_offsets[currentRowIndex] + index] * matB_data[matB_row_offsets[index + 1] + i];
        //printf("%f,(%d, %d, %f) \n", matA_data[matA_row_offsets[currentRowIndex] + index], currentRowIndex, i, C_buffer);
    }
}
Some context: this is only meant to emulate the expansion phase of sparse matrix-matrix multiplication (SpGEMM) using dynamic parallelism. That means I only want to print the individual products of the expanded matrix, not store any of them. The input matrices A and B are taken from https://sparse.tamu.edu, and each matrix is multiplied with itself (or with its transpose if it is not square). The matrices are stored in CSR format.
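In case the container matters: dCSR comes from the framework I'm building on; reconstructed from how it is used above, it looks roughly like this (simplified, not the real definition):

// Simplified sketch of the dCSR container, reconstructed from its usage above.
template <typename DataType>
struct dCSR
{
    size_t rows, cols, nnz;
    unsigned int* row_offsets; // device pointer, rows + 1 entries
    unsigned int* col_ids;     // device pointer, nnz entries
    DataType*     data;        // device pointer, nnz entries
};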
The problem: this works fine up to a certain matrix size. For example, I tested it with the following matrices:
https://sparse.tamu.edu/JGD_Kocay/Trec4
https://sparse.tamu.edu/JGD_Kocay/Trec5
https://sparse.tamu.edu/HB/1138_bus
However, with larger matrices it fails and reports the following CUDA error:
an illegal memory access was encountered
My guess is that there are too many nested kernel launches, but I could only find resources on the maximum grid dimensions and threads per block, not on a maximum number of nested launches.
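What I did find are the device-runtime limits below; I'm not sure whether either of them is the cap I'm hitting, but as I understand it this is how they would be queried and raised from the host (untested sketch):

// Untested sketch: query / raise the device-runtime limits tied to dynamic parallelism.
// Whether cudaLimitDevRuntimePendingLaunchCount is the limit I'm actually hitting is only a guess.
size_t pendingLaunchCount = 0, syncDepth = 0;
cudaDeviceGetLimit(&pendingLaunchCount, cudaLimitDevRuntimePendingLaunchCount);
cudaDeviceGetLimit(&syncDepth, cudaLimitDevRuntimeSyncDepth);
printf("pending launch count: %zu, sync depth: %zu\n", pendingLaunchCount, syncDepth);
cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, 32768); // arbitrary value; set from the host before launching d_expand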