I have a simple kernel that should compute the sum of products of the a and b arrays, but __syncthreads() doesn't seem to work at all. When I debug it, temp[i] returns uninitialized values for some elements, and if I omit __syncthreads() the results are the same. (I have checked that all the other parts of the CUDA code, such as initializing the arrays and copying them to device memory, are written correctly, so the problem is in this kernel.) (Note: I don't want to use atomicAdd.)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define MINREAL -1024.0
#define MAXREAL 1024.0
#define ACCURACY 0.01
#define NUM_OF_GPU_THREADS 256
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

void vecFillRand(int N, float *vec) {
    int i;
    for (i = 0; i < N; i++)
        vec[i] = (rand() / (float) RAND_MAX) * (MAXREAL - MINREAL) + MINREAL;
}

float seq_dotProduct(float *a, float *b, int n) {
    int i;
    float dp;
    dp = 0;
    for (i = 0; i < n; i++) {
        dp += a[i] * b[i];
    }
    return dp;
}

// kernel
__global__ void dotProduct(float *a, float *b, float *c) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float t;
    if (idx >= gridDim.x * blockDim.x)
        return;
    temp[threadIdx.x] = a[idx] * b[idx];
    __syncthreads();
    if (threadIdx.x == 0) {
        c[blockIdx.x] = 0.0f;
        for (int i = 0; i < NUM_OF_GPU_THREADS; i++) {
            t = temp[i];
            c[blockIdx.x] = c[blockIdx.x] + t;
        }
    }
}
int main(int argc, char* argv[]) {
    int i, n, ARRAY_BYTES;
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    float sum;
    float seq_sum;
    clock_t t;
    srand(time(NULL));
    if (argc == 2) {
        n = atoi(argv[1]);
    } else {
        printf("N? ");
        fflush(stdout);
        scanf("%d", &n);
    }
    int BLOCKS_PER_GRID = (unsigned int)ceil(n/(float)NUM_OF_GPU_THREADS);
    // arrays on host
    ARRAY_BYTES = n * sizeof(float);
    h_A = (float *) malloc(ARRAY_BYTES);
    h_B = (float *) malloc(ARRAY_BYTES);
    h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
    printf("\ncreating A and B...\n\n");
    vecFillRand(n, h_A);
    vecFillRand(n, h_B);
    vecFillRand(BLOCKS_PER_GRID, h_C);
    // arrays on device
    cudaMalloc((void**) &d_A, ARRAY_BYTES);
    cudaMalloc((void**) &d_B, ARRAY_BYTES);
    cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));
    // transfer the arrays to the GPU
    cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID, cudaMemcpyHostToDevice);
    // TIME START
    // create events for timing execution
    cudaEvent_t start = cudaEvent_t();
    cudaEvent_t stop = cudaEvent_t();
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    // record time into start event
    cudaEventRecord( start, 0 ); // 0 is the default stream id
    // launch the kernel
    dim3 block(NUM_OF_GPU_THREADS); // 256, 1, 1
    dim3 grid(BLOCKS_PER_GRID);
    printf("computing dotProduct... \n");
    dotProduct<<<grid, block>>>(d_A, d_B, d_C);
    // block until the device has completed
    cudaThreadSynchronize();
    // check if kernel execution generated an error
    // Check for any CUDA errors
    checkCUDAError("kernel invocation");
    // TIME END
    // record time into stop event
    cudaEventRecord( stop, 0 );
    // synchronize stop event to wait for end of kernel execution on stream 0
    cudaEventSynchronize( stop );
    // compute elapsed time (done by CUDA run-time)
    float elapsed_kernel = 0.f;
    cudaEventElapsedTime( &elapsed_kernel, start, stop );
    // release events
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    // print kernel time
    printf("CUDA TIME: %f \n\n", elapsed_kernel/1000);
    // copy back the result array to the CPU
    cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost);
    // Check for any CUDA errors
    checkCUDAError("memcpy");
    // compute sum
    sum = 0;
    for (i = 0; i < BLOCKS_PER_GRID; i++)
        sum += h_C[i];
    // launch sequential
    t = clock();
    printf("computing seq_dotProduct... \n");
    seq_sum = seq_dotProduct(h_A, h_B, n);
    t = clock() - t;
    printf("SEQ TIME: %f \n\n", ((float)t)/CLOCKS_PER_SEC);
    // check sum and seq_sum
    float value = abs(sum - seq_sum);
    if (value > ACCURACY) {
        printf("Test FAILED: %f \n", value);
    }
    else {
        printf("Test PASSED \n");
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
Answer (score: 3)
Your kernel needs to be rewritten a bit.

Your thread check will terminate any thread beyond the length of the input vectors, but the for loop that sums the elements of the "last block" does not properly check that it stays within the length of the vectors. This leads to out-of-bounds read accesses, which will show up when you run the code under cuda-memcheck (assuming the vector size you enter is not a multiple of 256).
Also, __syncthreads() should not be used in conditional code unless the condition evaluates the same across all threads in the block. For vector lengths that are not a multiple of 256, your last block violates this rule.
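The usual fix, and the one the modified kernel below uses, is to let every thread reach the barrier and give out-of-range threads an identity value (0.0f for a sum) instead of returning early. A minimal sketch of the pattern (the kernel name is illustrative; it assumes a launch with NUM_OF_GPU_THREADS threads per block):

__global__ void loadAndReduce(const float *a, const float *b, float *blockSums, int n) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Every thread stores something (0.0f past the end of the data),
    // so every thread reaches the barrier: no divergent __syncthreads().
    temp[threadIdx.x] = (idx < n) ? a[idx] * b[idx] : 0.0f;
    __syncthreads();
    // Because out-of-range slots were zero-filled, thread 0 can safely
    // sum all NUM_OF_GPU_THREADS entries without a bounds check.
    if (threadIdx.x == 0) {
        float s = 0.0f;
        for (int i = 0; i < NUM_OF_GPU_THREADS; i++)
            s += temp[i];
        blockSums[blockIdx.x] = s;
    }
}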
Beyond that, for larger vector sizes you are expecting too much precision (too many significant digits) out of a float quantity. You need to scale your ACCURACY test based on the number of floats being summed.
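One simple way to make the test size-independent, and essentially what the modified code below does, is to compare relative rather than absolute error. A sketch (it reuses sum and seq_sum from your main, uses fabsf to keep the comparison in float, and assumes sum is not vanishingly small):

// relative error: the tolerance no longer depends on n or on the
// magnitude of the inputs (assumes sum != 0)
float rel_err = fabsf((sum - seq_sum) / sum);
if (rel_err > ACCURACY)
    printf("Test FAILED: %f \n", rel_err);
else
    printf("Test PASSED \n");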
Here is a modified version of your code with the changes I mentioned above, which seems to work correctly for me:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"
#define MINREAL -1024.0
#define MAXREAL 1024.0
#define FAST_RED
#define ACCURACY 0.0001
#define NUM_OF_GPU_THREADS 256
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

void vecFillRand(int N, float *vec) {
    int i;
    for (i = 0; i < N; i++)
        vec[i] = (rand() / (float) RAND_MAX) * (MAXREAL - MINREAL) + MINREAL;
}

float seq_dotProduct(float *a, float *b, int n) {
    int i;
    float dp;
    dp = 0;
    for (i = 0; i < n; i++) {
        dp += a[i] * b[i];
    }
    return dp;
}

// kernel
__global__ void dotProduct(float *a, float *b, float *c, int n) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n)
        temp[threadIdx.x] = a[idx] * b[idx];
    else
        temp[threadIdx.x] = 0.0f;
    __syncthreads();
#ifdef FAST_RED
    // assumes block dimension is a power of 2
    for (int i = blockDim.x >> 1; i > 0; i >>= 1) {
        if (threadIdx.x < i)
            temp[threadIdx.x] += temp[threadIdx.x + i];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        c[blockIdx.x] = temp[0];
#else
    float t;
    if (threadIdx.x == 0) {
        c[blockIdx.x] = 0.0f;
        int j = 0;
        for (int i = blockIdx.x * blockDim.x; ((i < ((blockIdx.x + 1) * blockDim.x)) && (i < n)); i++) {
            t = temp[j++];
            c[blockIdx.x] = c[blockIdx.x] + t;
        }
    }
#endif
}
int main(int argc, char* argv[]) {
    int i, n, ARRAY_BYTES;
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    float sum;
    float seq_sum;
    clock_t t;
    srand(time(NULL));
    if (argc == 2) {
        n = atoi(argv[1]);
    } else {
        printf("N? ");
        fflush(stdout);
        scanf("%d", &n);
    }
    int BLOCKS_PER_GRID = (unsigned int)ceil(n/(float)NUM_OF_GPU_THREADS);
    printf("bpg = %d\n", BLOCKS_PER_GRID);
    // arrays on host
    ARRAY_BYTES = n * sizeof(float);
    h_A = (float *) malloc(ARRAY_BYTES);
    h_B = (float *) malloc(ARRAY_BYTES);
    h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
    printf("\ncreating A and B...\n\n");
    vecFillRand(n, h_A);
    vecFillRand(n, h_B);
    vecFillRand(BLOCKS_PER_GRID, h_C);
    // arrays on device
    cudaMalloc((void**) &d_A, ARRAY_BYTES);
    cudaMalloc((void**) &d_B, ARRAY_BYTES);
    cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));
    // transfer the arrays to the GPU
    cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyHostToDevice);
    // TIME START
    // create events for timing execution
    cudaEvent_t start = cudaEvent_t();
    cudaEvent_t stop = cudaEvent_t();
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    // record time into start event
    cudaEventRecord( start, 0 ); // 0 is the default stream id
    // launch the kernel
    dim3 block(NUM_OF_GPU_THREADS); // 256, 1, 1
    dim3 grid(BLOCKS_PER_GRID);
    printf("computing dotProduct... \n");
    dotProduct<<<grid, block>>>(d_A, d_B, d_C, n);
    // block until the device has completed
    cudaDeviceSynchronize();
    // check if kernel execution generated an error
    // Check for any CUDA errors
    checkCUDAError("kernel invocation");
    // TIME END
    // record time into stop event
    cudaEventRecord( stop, 0 );
    // synchronize stop event to wait for end of kernel execution on stream 0
    cudaEventSynchronize( stop );
    // compute elapsed time (done by CUDA run-time)
    float elapsed_kernel = 0.f;
    cudaEventElapsedTime( &elapsed_kernel, start, stop );
    // release events
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    // print kernel time
    printf("CUDA TIME: %f \n\n", elapsed_kernel/1000);
    // copy back the result array to the CPU
    cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost);
    // Check for any CUDA errors
    checkCUDAError("memcpy");
    // compute sum
    sum = 0;
    for (i = 0; i < BLOCKS_PER_GRID; i++)
        sum += h_C[i];
    // launch sequential
    t = clock();
    printf("computing seq_dotProduct... \n");
    seq_sum = seq_dotProduct(h_A, h_B, n);
    t = clock() - t;
    printf("SEQ TIME: %f \n\n", ((float)t)/CLOCKS_PER_SEC);
    // check sum and seq_sum
    float value = abs((sum - seq_sum)/sum);
    if (value > ACCURACY) {
        printf("Test FAILED: err: %f cpu: %f gpu: %f \n", value, seq_sum, sum);
    }
    else {
        printf("Test PASSED \n");
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
EDIT: In response to a question below, I modified the code to demonstrate a basic parallel reduction.
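For reference, the FAST_RED branch is a standard shared-memory tree reduction: each pass adds the upper half of temp[] into the lower half, halving the number of active threads, with a barrier between passes so readers and writers stay in step. A small trace of the same pattern with blockDim.x = 8 for brevity (the real kernel uses 256):

// start: temp = {t0, t1, t2, t3, t4, t5, t6, t7}
// i = 4: temp[k] += temp[k+4] for k < 4 -> lower half = {t0+t4, t1+t5, t2+t6, t3+t7}
// i = 2: temp[k] += temp[k+2] for k < 2 -> lower half = {t0+t4+t2+t6, t1+t5+t3+t7}
// i = 1: temp[0] += temp[1]             -> temp[0] = t0+t1+t2+t3+t4+t5+t6+t7
// (the __syncthreads() after each pass orders the writes before the next reads)

With 256 threads this takes log2(256) = 8 passes instead of the 256-iteration serial loop that thread 0 runs in the #else branch, and because the block size is a power of 2 no extra bounds handling is needed inside the loop. To try both variants, compile the listing with nvcc (for example, nvcc -o dot_product dot_product.cu, where the file name is whatever you saved it as), pass the vector length as the first argument, and comment out the #define FAST_RED line to exercise the serial per-block loop instead.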