我在 CUDA Documentation 中读到:在每个块内部,线程以 32 个为一组(称为 warp)执行;warp 中的每个线程执行相同的指令,但可以访问不同的数据。我的任务是验证这一说法的真实性。
现在我做的是:启动了一个只有 1 个块、每块 256 个线程的内核,因此必须执行 8 批 warp(256/32)。
我将创建一个大小为32的共享变量,将其分配给
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
然后将该共享变量的值写入长度为 256 个 int 的全局数组:
outputPointer[ threadIdx.x ] = sharedVariable [ threadIdx.x % 32 ];
理想情况下,根据假设我应该输出
0,0,0,…(共 32 个 0),1,1,1,…(共 32 个 1),2,2,2,…(共 32 个 2),依此类推直到 7
但我的输出仅为4,4,4,4,4
Cuda Code:
// Kernel under discussion: intentionally demonstrates a shared-memory race.
__global__ void addKernel(int *inputPointer, int *outputPointer)
{
// 32-entry block-shared scratch array (uninitialized until written).
__shared__ int sharedVariable[ 32 ];
// With a 256-thread block, threads tid, tid+32, tid+64, ... all map to the
// same index (tid % 32), so these writes race against each other.
sharedVariable [ threadIdx.x % 32 ] = 0 ;
// Each warp writes its own warp id (threadIdx.x / 32, range 0..7); which
// warp's value survives is undefined -- this is the race the answers explain.
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
// The read-back also races: no __syncthreads() between write and read.
outputPointer[ threadIdx.x ] = sharedVariable [ threadIdx.x % 32 ];
}
// Host-side sketch (elided with "......"): launches addKernel with a single
// block of 256 threads (8 warps of 32), then prints the copied-back results.
int main () {
......
// Launch configuration: 1 block, 256 threads per block.
addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);
......
/**Print output here */
//I am getting 4 ,4,4,4,4,4,4,4,4 as output
}
完整代码:
#include "cuda_runtime.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#define SIZE 256 * sizeof(int)
// Kernel from the full listing; same shared-memory race as the snippet above.
__global__ void addKernel(int *inputPointer, int *outputPointer)
{
// Block-shared 32-entry array; contents undefined before the first write.
__shared__ int sharedVariable[ 32 ];
// Threads tid, tid+32, tid+64, ... of the 256-thread block collide on
// index (tid % 32): concurrent writes to the same location -> data race.
sharedVariable [ threadIdx.x % 32 ] = 0;
// Overwrites the zero immediately with the warp id (threadIdx.x / 32);
// the surviving value depends on warp scheduling order (undefined).
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
// Racy read: no __syncthreads() separates the writes from this read.
outputPointer[ threadIdx.x ] = sharedVariable [ threadIdx.x % 32 ];
}
// Reports a failed CUDA call and terminates, so errors are not silently
// ignored (CUDA calls fail asynchronously and otherwise surface much later).
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int main()
{
    // Host buffers. SIZE is 256 * sizeof(int).
    // NOTE(review): the SIZE macro is defined without parentheses; fine in
    // these call sites, but fragile inside larger expressions.
    int *inputPointer = (int *) malloc(SIZE);
    int *outputPointer = (int *) malloc(SIZE);
    if (inputPointer == NULL || outputPointer == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    int *device_inputPointer;
    int *device_outputPointer;
    checkCuda(cudaMalloc((void **)&device_inputPointer, SIZE), "cudaMalloc(input)");
    checkCuda(cudaMalloc((void **)&device_outputPointer, SIZE), "cudaMalloc(output)");

    // Zero the (otherwise unused) input and copy it to the device.
    memset(inputPointer, 0, SIZE);
    checkCuda(cudaMemcpy(device_inputPointer, inputPointer, SIZE, cudaMemcpyHostToDevice),
              "cudaMemcpy H2D");

    // Launch a kernel on the GPU: one block of 256 threads (8 warps of 32).
    addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);
    // Kernel launches do not return an error code; query it explicitly.
    checkCuda(cudaGetLastError(), "addKernel launch");

    // A blocking cudaMemcpy also synchronizes with the kernel before we read.
    checkCuda(cudaMemcpy(outputPointer, device_outputPointer, SIZE, cudaMemcpyDeviceToHost),
              "cudaMemcpy D2H");

    for (int i = 0; i < 256; i++) {
        printf(" %d ", outputPointer[i]);
    }

    // Release device and host memory (the original leaked all four buffers).
    cudaFree(device_inputPointer);
    cudaFree(device_outputPointer);
    free(inputPointer);
    free(outputPointer);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");
    getch();
    return 0;
}
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#define SIZE 256 * sizeof(int)
// Duplicate paste of the kernel; same shared-memory race condition.
__global__ void addKernel(int *inputPointer, int *outputPointer)
{
// Block-shared 32-entry array; uninitialized until written.
__shared__ int sharedVariable[ 32 ];
// In a 256-thread block, 8 threads (tid, tid+32, ...) share each index
// (tid % 32), so the following writes race with each other.
sharedVariable [ threadIdx.x % 32 ] = 0;
// Immediately overwritten with the warp id (threadIdx.x / 32, 0..7);
// which warp wins is undefined behavior.
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
// Racy read-back: no __syncthreads() between write and read.
outputPointer[ threadIdx.x ] = sharedVariable [ threadIdx.x % 32 ];
}
// Aborts on a failed CUDA call so that errors surface at the call site
// instead of being silently ignored.
static void reportCudaFailure(cudaError_t status, const char *site)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", site, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

int main()
{
    // Host-side buffers; SIZE expands to 256 * sizeof(int).
    // NOTE(review): SIZE is defined without surrounding parentheses --
    // harmless here, but worth parenthesizing at its definition.
    int *inputPointer = (int *) malloc(SIZE);
    int *outputPointer = (int *) malloc(SIZE);
    if (inputPointer == NULL || outputPointer == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    int *device_inputPointer;
    int *device_outputPointer;
    reportCudaFailure(cudaMalloc((void **)&device_inputPointer, SIZE), "cudaMalloc(input)");
    reportCudaFailure(cudaMalloc((void **)&device_outputPointer, SIZE), "cudaMalloc(output)");

    // Zero the (unused) input buffer and push it to the device.
    memset(inputPointer, 0, SIZE);
    reportCudaFailure(cudaMemcpy(device_inputPointer, inputPointer, SIZE, cudaMemcpyHostToDevice),
                      "cudaMemcpy H2D");

    // Launch a kernel on the GPU: a single block of 256 threads (8 warps).
    addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);
    // Launches report configuration errors only via cudaGetLastError().
    reportCudaFailure(cudaGetLastError(), "addKernel launch");

    // Blocking device-to-host copy; also synchronizes with the kernel.
    reportCudaFailure(cudaMemcpy(outputPointer, device_outputPointer, SIZE, cudaMemcpyDeviceToHost),
                      "cudaMemcpy D2H");

    for (int i = 0; i < 256; i++) {
        printf(" %d ", outputPointer[i]);
    }

    // Free device and host allocations (leaked in the original listing).
    cudaFree(device_inputPointer);
    cudaFree(device_outputPointer);
    free(inputPointer);
    free(outputPointer);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    reportCudaFailure(cudaDeviceReset(), "cudaDeviceReset");
    getch();
    return 0;
}
我在不同的硬件上测试过它:在 K20(Tesla 系列产品)上它工作正常。
答案 0 :(得分:2)
代码在以下行中具有未定义的行为:
sharedVariable [ threadIdx.x % 32 ] = 0;
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
多个线程可以具有相同的threadIdx.x % 32
值,并且这些线程会尝试同时写入相同的共享内存位置。这将导致这些线程之间的竞争条件。
例如,考虑threadIdx.x
0,32,64,96等的线程。所有这些线程都将尝试访问0
的索引sharedVariable
,从而导致未定义的行为。所有线程彼此偏移32
的情况也是如此(仅在当前示例中)。
答案 1 :(得分:2)
我认为你并不完全清楚cuda代码是如何并行执行的。
第sharedVariable [ threadIdx.x % 32 ] = 0 ;
行完全没用,因为在下一行中,您使用threadIdx.x /32
覆盖它。
此外,您对 warp 数量的假设是错误的。您的线程块(threadblock)包含 256 个线程,所以共有 8 个 warp(256/32)。
您不能指望这段代码有确定的结果,因为它的行为是未定义的(undefined behavior)!threadIdx.x /32
的结果将在0到7的范围内,具体取决于范围为0..255的threadIdx.x
。
因为共有 8 个 warp(每个 warp 含 32 个线程),所以每个 sharedVariable [ threadIdx.x % 32 ]
位置都会被写入 8 次,而你无法控制哪个 warp 先执行、哪个最后执行。
在你的情况下,编号为 4 的 warp 最后执行,因此你的输出全是 4。
要获得您期望从内核获得的结果,可以将其更改为:
// Race-free version: each thread writes its own warp index straight to
// global memory; no shared memory is involved, so no synchronization needed.
__global__ void addKernel(int *outputPointer)
{
    // 32 threads per warp: shifting right by 5 equals dividing by 32.
    unsigned int warpIdx = threadIdx.x >> 5;
    outputPointer[threadIdx.x] = warpIdx;
}
我认为按你设想的那种方式使用共享内存是行不通的。