Question

我试图更好地了解cuda内核中的动态共享内存。我复制了此example并对其进行了修改，以允许我更改工作项的数量。它适用于<= 1024，更大的元素，并且结果数组d中的所有元素均为0。此外，它报告以下错误“遇到非法内存访问”。我很茫然，无法找到错误。对于我做错的任何帮助/解释将不胜感激。谢谢。

代码在讨论过程中已更新

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>

#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
        if (code != cudaSuccess) {
                fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
                if (abort) exit(code);
        }
}

__global__ void dynamicAddNine(int *d, int n) {
        extern __shared__ int s[];
        int t = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        s[t] = d[t] + 9;
        __syncthreads();
        d[t] = s[t];
}

static void parseArgs(int argc, char **argv) {
        if (argc != 2) {
                printf("Usage : %s <N>, where N (+ve int) is the required array size\n", argv[0]);
                exit(1);
        }
        if (atoi(argv[1]) < 0 ) {
                printf("Usage : %s <N>, where N (+ve int) is the required array size\n", argv[0]);
                exit(1);
        }
}

int main(int argc, char *argv[]) {
        parseArgs(argc, argv);
        const int n = atoi(argv[1]);
        int T = 1024; // maximum number of threads
        if (n < 1024) { T = n;} // if no elements required < 1024

        int a[n], r[n], d[n];

        for (int i = 0; i < n; i++) {
                a[i] = i;
                r[i] = i+9;
                d[i] = 0;
        }

        int *d_d;
        gpuErrChk(cudaMalloc(&d_d, n*sizeof(int)));
        int B = (int)ceil((float)n/(float)T);  //calculate no of blocks required

        // run version with dynamic shared memory
        std::cout << "B: " << B << ",  T: " << T << std::endl;

        gpuErrChk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
        dynamicAddNine<<<B, T, n*sizeof(int)>>>(d_d, n);
        gpuErrChk(cudaPeekAtLastError());
        gpuErrChk(cudaDeviceSynchronize());

        gpuErrChk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));

        for (int i = 0; i < n; i++) {
                if (d[i] != r[i]) printf("Error: d[%d] != r[%d] (%d, %d)\n", i, i, d[i], r[i]);
        }
}

cuda内核中的动态共享内存

0 个答案: