该示例代码基于“ cuda”的示例。可以在/NVIDIA_CUDA-9.1_Samples/0_Simple/vectorAdd/vectorAdd.cu
简而言之,我删除了所有插图,希望不会给您带来麻烦。以下是我的代码。
#include <stdio.h>
__global__ void
vectorAdd( const float *A, const float *B, float *C, int numElements )
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if ( i < numElements )
{
C[i] = A[i] + B[i];
}
printf( "%d\n", i );
}
int main( void )
{
int numElements = 50000;
size_t size = numElements * sizeof( float );
float *h_A = ( float * ) malloc( size );
float *h_B = ( float * ) malloc( size );
float *h_C = ( float * ) malloc( size );
for ( int i = 0; i < numElements; ++i )
{
h_A[i] = rand( ) / ( float ) RAND_MAX;
h_B[i] = rand( ) / ( float ) RAND_MAX;
}
float *d_A = NULL;
cudaMalloc( ( void ** ) &d_A, size );
float *d_B = NULL;
cudaMalloc( ( void ** ) &d_B, size );
float *d_C = NULL;
cudaMalloc( ( void ** ) &d_C, size );
cudaMemcpy( d_A, h_A, size, cudaMemcpyHostToDevice );
cudaMemcpy( d_B, h_B, size, cudaMemcpyHostToDevice );
int threadsPerBlock = 256;
int blocksPerGrid = ( numElements + threadsPerBlock - 1 ) / threadsPerBlock;
printf( "CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock );
vectorAdd <<< blocksPerGrid, threadsPerBlock >>> ( d_A, d_B, d_C, numElements );
cudaMemcpy( h_C, d_C, size, cudaMemcpyDeviceToHost );
cudaFree( d_A );
cudaFree( d_B );
cudaFree( d_C );
free( h_A );
free( h_B );
free( h_C );
printf( "Done\n" );
return 0;
}
printf
的结果只有4096行,而numElements
的数目却没有50000行,并且每次运行都具有不同的范围。
为什么会这样?如何获得正确的printf
(50000行,从0〜49999)?
感谢您的任何评论和答案。