我试图更好地了解cuda内核中的动态共享内存。我复制了此example并对其进行了修改,以允许我更改工作项的数量。它适用于<= 1024,更大的元素,并且结果数组d中的所有元素均为0。此外,它报告以下错误“遇到非法内存访问”。我很茫然,无法找到错误。对于我做错的任何帮助/解释将不胜感激。谢谢。
代码在讨论过程中已更新
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void dynamicAddNine(int *d, int n) {
extern __shared__ int s[];
int t = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
s[t] = d[t] + 9;
__syncthreads();
d[t] = s[t];
}
static void parseArgs(int argc, char **argv) {
if (argc != 2) {
printf("Usage : %s <N>, where N (+ve int) is the required array size\n", argv[0]);
exit(1);
}
if (atoi(argv[1]) < 0 ) {
printf("Usage : %s <N>, where N (+ve int) is the required array size\n", argv[0]);
exit(1);
}
}
int main(int argc, char *argv[]) {
parseArgs(argc, argv);
const int n = atoi(argv[1]);
int T = 1024; // maximum number of threads
if (n < 1024) { T = n;} // if no elements required < 1024
int a[n], r[n], d[n];
for (int i = 0; i < n; i++) {
a[i] = i;
r[i] = i+9;
d[i] = 0;
}
int *d_d;
gpuErrChk(cudaMalloc(&d_d, n*sizeof(int)));
int B = (int)ceil((float)n/(float)T); //calculate no of blocks required
// run version with dynamic shared memory
std::cout << "B: " << B << ", T: " << T << std::endl;
gpuErrChk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
dynamicAddNine<<<B, T, n*sizeof(int)>>>(d_d, n);
gpuErrChk(cudaPeekAtLastError());
gpuErrChk(cudaDeviceSynchronize());
gpuErrChk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i++) {
if (d[i] != r[i]) printf("Error: d[%d] != r[%d] (%d, %d)\n", i, i, d[i], r[i]);
}
}